From d2fab7315bac3a586f2f1b1c8d64f2441f89ca64 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 2 Mar 2018 11:18:12 +0000 Subject: COMPMID-935 - Implementing Convolution with Winograd on OpenCL (part 4) Implemented Winograd Output Transform (2x2,3x3) on OpenCL Implemented CLWinogradConvolutionLayer on OpenCL Change-Id: I6a113fc5f052ca07f878d2b800d2ab003f84af65 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125148 Reviewed-by: Georgios Pinitas Tested-by: Jenkins --- tests/validation/reference/Winograd.cpp | 218 +++++++++++++++++++++++++------- 1 file changed, 171 insertions(+), 47 deletions(-) (limited to 'tests/validation/reference/Winograd.cpp') diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp index 3ed55fb9fc..c760663b22 100644 --- a/tests/validation/reference/Winograd.cpp +++ b/tests/validation/reference/Winograd.cpp @@ -38,6 +38,87 @@ namespace reference { namespace { +template +void winograd_filter_transform3x3(const SimpleTensor &in, SimpleTensor &out) +{ + // Simple tensor for the 3x3 input tile + SimpleTensor input_tile{ TensorShape(3u, 3u), in.data_type(), 1 }; + + // Simple tensor for the transformation matrix + SimpleTensor trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 }; + + // Simple tensor for the transformation matrix transpose + SimpleTensor trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 }; + + // Simple tensor for the 4x3 temporary tile + SimpleTensor tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 }; + + // Simple tensor for the 4x4 output tile + SimpleTensor output_tile{ TensorShape(4u, 4u), in.data_type(), 1 }; + + // Initialize transformation matrix + // 1 | 0 | 0 + // 0.5 | 0.5 | 0.5 + // 0.5 |-0.5 | 0.5 + // 0 | 0 | 1 + trans_matrix[0 + 0 * 3] = 1.0f; + trans_matrix[1 + 0 * 3] = 0.0f; + trans_matrix[2 + 0 * 3] = 0.0f; + trans_matrix[0 + 1 * 3] = 0.5f; + trans_matrix[1 + 1 * 3] = 0.5f; + trans_matrix[2 + 1 * 3] = 0.5f; + trans_matrix[0 + 2 * 3] = 0.5f; + 
trans_matrix[1 + 2 * 3] = -0.5f; + trans_matrix[2 + 2 * 3] = 0.5f; + trans_matrix[0 + 3 * 3] = 0.0f; + trans_matrix[1 + 3 * 3] = 0.0f; + trans_matrix[2 + 3 * 3] = 1.0f; + + // Transpose the transformation matrix + transpose_matrix(trans_matrix, trans_matrix_transposed); + + const int num_channels = in.shape()[2]; + const int num_filters = in.shape()[3]; + const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters); + + for(int n = 0; n < num_batches; ++n) + { + for(int w = 0; w < num_filters; ++w) + { + for(int z = 0; z < num_channels; ++z) + { + // Load the 3x3 tile from the input tensor + get_tile(in, input_tile, Coordinates(0, 0, z, w, n)); + + // First transformation + matrix_multiply(trans_matrix, input_tile, tmp_tile); + + // Second transformation + matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile); + + // Store the 4x4 output tile across the 16 channels + const int output_offset = w + z * num_filters; + out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4]; + out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4]; + out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4]; + out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4]; + out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4]; + out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4]; + out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4]; + out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4]; + out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4]; + out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4]; + out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4]; + out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4]; + out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 
4]; + out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4]; + out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4]; + out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4]; + } + } + } +} + template void winograd_input_transform3x3(const SimpleTensor &src, SimpleTensor &dst, const PadStrideInfo &conv_info) { @@ -112,56 +193,70 @@ void winograd_input_transform3x3(const SimpleTensor &src, SimpleTensor &ds } template -void winograd_filter_transform3x3(const SimpleTensor &in, SimpleTensor &out) +void winograd_output_transform3x3(const SimpleTensor &in, SimpleTensor &out, int num_tiles_x) { + ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16); + ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]); + // Simple tensor for the 3x3 input tile - SimpleTensor input_tile{ TensorShape(3u, 3u), in.data_type(), 1 }; + SimpleTensor input_tile{ TensorShape(4u, 4u), in.data_type(), 1 }; // Simple tensor for the transformation matrix - SimpleTensor trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 }; + SimpleTensor trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 }; // Simple tensor for the transformation matrix transpose - SimpleTensor trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 }; + SimpleTensor trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 }; // Simple tensor for the 4x3 temporary tile - SimpleTensor tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 }; + SimpleTensor tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 }; // Simple tensor for the 4x4 output tile - SimpleTensor output_tile{ TensorShape(4u, 4u), in.data_type(), 1 }; + SimpleTensor output_tile{ TensorShape(2u, 2u), in.data_type(), 1 }; // Initialize transformation matrix - // 1 | 0 | 0 - // 0.5 | 0.5 | 0.5 - // 0.5 |-0.5 | 0.5 - // 0 | 0 | 1 - trans_matrix[0 + 0 * 3] = 1.0f; - trans_matrix[1 + 0 * 3] = 0.0f; - trans_matrix[2 + 0 * 3] = 0.0f; - trans_matrix[0 + 1 * 3] = 0.5f; - trans_matrix[1 + 1 * 3] = 0.5f; - 
trans_matrix[2 + 1 * 3] = 0.5f; - trans_matrix[0 + 2 * 3] = 0.5f; - trans_matrix[1 + 2 * 3] = -0.5f; - trans_matrix[2 + 2 * 3] = 0.5f; - trans_matrix[0 + 3 * 3] = 0.0f; - trans_matrix[1 + 3 * 3] = 0.0f; - trans_matrix[2 + 3 * 3] = 1.0f; + // 1 | 1 | 1 | 0 + // 0 | 1 | -1 | -1 + trans_matrix[0 + 0 * 4] = 1.0f; + trans_matrix[1 + 0 * 4] = 1.0f; + trans_matrix[2 + 0 * 4] = 1.0f; + trans_matrix[3 + 0 * 4] = 0.0f; + trans_matrix[0 + 1 * 4] = 0.0f; + trans_matrix[1 + 1 * 4] = 1.0f; + trans_matrix[2 + 1 * 4] = -1.0f; + trans_matrix[3 + 1 * 4] = -1.0f; // Transpose the transformation matrix transpose_matrix(trans_matrix, trans_matrix_transposed); - const int num_channels = in.shape()[2]; - const int num_filters = in.shape()[3]; - const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters); + const int w_in = in.shape()[0]; + const int h_in = in.shape()[1]; + const int c_in = in.shape()[2]; + const int w_out = out.shape()[0]; + const int h_out = out.shape()[1]; + const int c_out = out.shape()[2]; + const int num_batches = in.shape().total_size() / (w_in * h_in * c_in); + + // Input strides + const int stridey_in = w_in; + const int stridez_in = stridey_in * h_in; + const int stridew_in = stridez_in * c_in; + + // Output strides + const int stridey_out = w_out; + const int stridez_out = stridey_out * h_out; + const int stridew_out = stridez_out * c_out; for(int n = 0; n < num_batches; ++n) { - for(int w = 0; w < num_filters; ++w) + for(int y = 0; y < h_in; ++y) { - for(int z = 0; z < num_channels; ++z) + for(int x = 0; x < w_in; ++x) { - // Load the 3x3 tile from the input tensor - get_tile(in, input_tile, Coordinates(0, 0, z, w, n)); + // Load the 4x4 tile across the 16 channels of the input tensor + for(int z = 0; z < c_in; ++z) + { + input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)]; + } // First transformation matrix_multiply(trans_matrix, input_tile, tmp_tile); @@ -169,24 +264,29 @@ void 
winograd_filter_transform3x3(const SimpleTensor &in, SimpleTensor &ou // Second transformation matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile); - // Store the 4x4 output tile across the 16 channels - const int output_offset = w + z * num_filters; - out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4]; - out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4]; - out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4]; - out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4]; - out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4]; - out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4]; - out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4]; - out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4]; - out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4]; - out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4]; - out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4]; - out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4]; - out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4]; - out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4]; - out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4]; - out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4]; + // Store the 2x2 output tile + const int xo = (y % num_tiles_x) * 2; + const int yo = (y / num_tiles_x) * 2; + const int zo = x; + + const int output_offset = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out); + out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2]; + + // Check out-of-bound writes + if(xo + 1 < w_out) + { + out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2]; + } + + if(yo + 1 < h_out) 
+ { + out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2]; + } + + if((yo + 1 < h_out) && (xo + 1 < w_out)) + { + out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2]; + } } } } @@ -234,8 +334,32 @@ SimpleTensor winograd_filter_transform(const SimpleTensor &in, const Tenso return out; } +template +SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles) +{ + ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format"); + ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height); + ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area()); + + // Create reference + SimpleTensor out{ output_shape, in.data_type(), 1 }; + + switch(kernel_dims.width) + { + case 3: + winograd_output_transform3x3(in, out, num_tiles.width); + break; + default: + ARM_COMPUTE_ERROR("Only supported 3x3 kernel"); + break; + } + + return out; +} + template SimpleTensor winograd_input_transform(const SimpleTensor &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims); template SimpleTensor winograd_filter_transform(const SimpleTensor &in, const TensorShape &output_shape); +template SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles); } // namespace reference } // namespace validation } // namespace test -- cgit v1.2.1