From f1c2bf0971dd1c996da149faf3dd669d566074c7 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 13 Jun 2018 14:05:54 +0100 Subject: COMPMID-1201 - Implementing Winograd Convolution Layer 1x3 and 3x1 kernels on OpenCL Change-Id: I39667bab49daa4da009694163274a59fd3574c73 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137595 Tested-by: Jenkins Reviewed-by: Giorgio Arena Reviewed-by: Georgios Pinitas --- arm_compute/core/CL/CLHelpers.h | 10 + arm_compute/core/Helpers.h | 22 + arm_compute/core/utils/misc/ShapeCalculator.h | 10 +- src/core/CL/CLHelpers.cpp | 38 + src/core/CL/CLKernelLibrary.cpp | 16 +- src/core/CL/cl_kernels/winograd.cl | 1251 +++++++++++++++++--- .../CL/kernels/CLWinogradFilterTransformKernel.cpp | 10 +- .../CL/kernels/CLWinogradInputTransformKernel.cpp | 35 +- .../CL/kernels/CLWinogradOutputTransformKernel.cpp | 35 +- .../CL/functions/CLWinogradConvolutionLayer.cpp | 22 +- tests/datasets/LargeConvolutionLayerDataset.h | 44 + tests/datasets/ShapeDatasets.h | 64 + tests/datasets/SmallConvolutionLayerDataset.h | 30 + tests/datasets/WinogradInputTransformDataset.h | 108 ++ tests/datasets/WinogradOutputTransformDataset.h | 85 +- tests/validation/CL/Winograd.cpp | 353 ++++-- tests/validation/Helpers.cpp | 31 +- tests/validation/Helpers.h | 9 + .../fixtures/WinogradConvolutionLayerFixture.h | 13 +- tests/validation/reference/Winograd.cpp | 130 +- 20 files changed, 1999 insertions(+), 317 deletions(-) diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index 1054f9a615..3b025cc5bb 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -109,5 +109,15 @@ bool arm_non_uniform_workgroup_supported(const cl::Device &device); * @return True if the extension is supported */ bool dot8_supported(const cl::Device &device); + +/** This function checks if the Winograd configuration (defined through the output tile, kernel size and the data layout) is supported on OpenCL + * + * @param[in] output_tile 
Output tile for the Winograd filtering algorithm + * @param[in] kernel_size Kernel size for the Winograd filtering algorithm + * @param[in] data_layout Data layout of the input tensor + * + * @return True if the configuration is supported + */ +bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout); } #endif /* __ARM_COMPUTE_CLHELPERS_H__ */ diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index 7d922ae187..a3cbfb94e3 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -111,6 +111,28 @@ struct is_contained> : is_contained }; } +/** Calculate the number of output tiles required by Winograd Convolution layer. This utility function can be used by the Winograd input transform + * to know the number of tiles on the x and y direction + * + * @param[in] in_dims Spatial dimensions of the input tensor of convolution layer + * @param[in] kernel_size Kernel size + * @param[in] output_tile_size Size of a single output tile + * @param[in] conv_info Convolution info (i.e. pad, stride,...) 
+ * + * @return the number of output tiles along the x and y directions of size "output_tile_size" + */ +inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, const Size2D &kernel_size, const Size2D &output_tile_size, const PadStrideInfo &conv_info) +{ + int num_tiles_x = std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(output_tile_size.width)); + int num_tiles_y = std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(output_tile_size.height)); + + // Clamp in case we provide paddings but we have 1D convolution + num_tiles_x = std::min(num_tiles_x, static_cast<int>(in_dims.width)); + num_tiles_y = std::min(num_tiles_y, static_cast<int>(in_dims.height)); + + return Size2D(num_tiles_x, num_tiles_y); +} + /** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
* diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 115cbe688d..221387649f 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -255,12 +255,14 @@ inline TensorShape compute_winograd_input_transform_shape(const ITensorInfo &inp const size_t idx_h = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT); const size_t idx_c = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL); - // Compute height - const unsigned int num_tiles_x = std::ceil((input.tensor_shape()[idx_w] - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast(output_tile_size.width)); - const unsigned int num_tiles_y = std::ceil((input.tensor_shape()[idx_h] - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast(output_tile_size.height)); + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]), + kernel_size, + output_tile_size, + conv_info); const unsigned int width = input.tensor_shape()[idx_c]; - const unsigned int height = num_tiles_x * num_tiles_y; + const unsigned int height = num_tiles.area(); const unsigned int depth = input_tile_size.area(); TensorShape output_shape{ input.tensor_shape() }; diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 23c24c0337..df06aff647 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/Types.h" +#include #include namespace arm_compute @@ -164,4 +165,41 @@ bool device_supports_extension(const cl::Device &device, const char *extension_n return (pos != std::string::npos); } +bool cl_winograd_convolution_layer_supported(const Size2D 
&output_tile, const Size2D &kernel_size, DataLayout data_layout) +{ + ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN); + + using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; + + std::vector<WinogradConfiguration> winograd_filter_transform_nchw = + { + WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)), + WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)), + WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)), + WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)), + WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)), + WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)), + WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)) + }; + + std::vector<WinogradConfiguration> winograd_filter_transform_nhwc = + { + WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)), + WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)), + WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)) + }; + + auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), + std::pair<int, int>(kernel_size.width, kernel_size.height)); + + // Return true if supported + if(data_layout == DataLayout::NCHW) + { + return (std::find(winograd_filter_transform_nchw.begin(), winograd_filter_transform_nchw.end(), p) != winograd_filter_transform_nchw.end()); + } + else + { + return (std::find(winograd_filter_transform_nhwc.begin(), winograd_filter_transform_nhwc.end(), p) != winograd_filter_transform_nhwc.end()); + } +} } // namespace arm_compute diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index aa11edf9ec..2bcacad7f0 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -372,18 +372,32 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "warp_perspective_nearest_neighbour", "warp_perspective.cl" }, { "warp_perspective_bilinear", "warp_perspective.cl" }, { "winograd_filter_transform_2x2_3x3_nchw", "winograd.cl" }, + { "winograd_filter_transform_2x1_3x1_nchw", "winograd.cl" }, + { "winograd_filter_transform_1x2_1x3_nchw",
"winograd.cl" }, { "winograd_filter_transform_4x4_3x3_nchw", "winograd.cl" }, + { "winograd_filter_transform_4x1_3x1_nchw", "winograd.cl" }, + { "winograd_filter_transform_1x4_1x3_nchw", "winograd.cl" }, { "winograd_filter_transform_4x4_5x5_nchw", "winograd.cl" }, { "winograd_filter_transform_4x4_3x3_nhwc", "winograd.cl" }, { "winograd_filter_transform_4x4_5x5_nhwc", "winograd.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl" }, { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl" }, { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl" }, + { "winograd_input_transform_2x1_3x1_stepz1_nchw", "winograd.cl" }, + { "winograd_input_transform_2x1_3x1_stepz2_nchw", "winograd.cl" }, + { "winograd_input_transform_1x2_1x3_stepz1_nchw", "winograd.cl" }, + { "winograd_input_transform_1x2_1x3_stepz2_nchw", "winograd.cl" }, { "winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl" }, + { "winograd_input_transform_4x1_3x1_stepz1_nchw", "winograd.cl" }, + { "winograd_input_transform_1x4_1x3_stepz1_nchw", "winograd.cl" }, + { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl" }, { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "winograd.cl" }, { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "winograd.cl" }, { "winograd_output_transform_2x2_3x3_nchw", "winograd.cl" }, + { "winograd_output_transform_2x1_3x1_nchw", "winograd.cl" }, + { "winograd_output_transform_1x2_1x3_nchw", "winograd.cl" }, { "winograd_output_transform_4x4_3x3_nchw", "winograd.cl" }, + { "winograd_output_transform_4x1_3x1_nchw", "winograd.cl" }, + { "winograd_output_transform_1x4_1x3_nchw", "winograd.cl" }, { "winograd_output_transform_4x4_5x5_nchw", "winograd.cl" }, { "winograd_output_transform_4x4_3x3_nhwc", "winograd.cl" }, { "winograd_output_transform_4x4_5x5_nhwc", "winograd.cl" }, diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl index 93e038fff9..ce48d28b74 100644 --- a/src/core/CL/cl_kernels/winograd.cl +++ 
b/src/core/CL/cl_kernels/winograd.cl @@ -25,9 +25,11 @@ #if defined(SRC_DIM_Z) -/** This OpenCL kernel performs Winograd filter transform 3x3 when the data layout is NCHW and the output tile is 2x2 +/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2 * * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -57,39 +59,47 @@ __kernel void winograd_filter_transform_2x2_3x3_nchw( const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); // Load the values from the input tensor +#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + float3 w0 = vload3(0, (__global float *)(src_addr)); +#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + float3 w0 = (float3)(*((__global float *)(src_addr + 0 * src_stride_y)), + *((__global float *)(src_addr + 1 * src_stride_y)), + *((__global float *)(src_addr + 2 * src_stride_y))); +#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y)); float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y)); float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y)); - - // Transform the 3x3 tile in a 4x4 tile - float4 out0 = 0.0f; - float4 out1 = 0.0f; - float4 out2 = 0.0f; - float4 out3 = 0.0f; +#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) // Row 0 - out0.s0 = 
(w0.s0); - out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f; - out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f; - out0.s3 = (w0.s2); + float4 out0 = 0.0f; + out0.s0 = (w0.s0); + out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f; + out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f; + out0.s3 = (w0.s2); +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) // Row 1 - out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f; - out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f; - out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f; - out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f; + float4 out1 = 0.0f; + out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f; + out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f; + out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f; + out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f; // Row 2 - out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f; - out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f; - out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f; - out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f; + float4 out2 = 0.0f; + out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f; + out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f; + out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f; + out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f; // Row 3 - out3.s0 = (w2.s0); - out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f; - out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f; - out3.s3 = (w2.s2); + float4 out3 = 0.0f; + out3.s0 = (w2.s0); + out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f; + out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f; + out3.s3 = (w2.s2); +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) int z = get_global_id(2); int x0 = z / SRC_DIM_Z; // idx filter @@ -98,11 
+108,15 @@ __kernel void winograd_filter_transform_2x2_3x3_nchw( // Get output address __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; - // Store the 16 values across the 16 channels - *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3; + // Store the values across the channels + // 16 channels for 3x3 kernels + // 4 channels for 3x1 or 1x3 kernels + *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) *(__global float *)(dst_addr + 4 * dst_stride_z) = out1.s0; *(__global float *)(dst_addr + 5 * dst_stride_z) = out1.s1; *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s2; @@ -115,11 +129,14 @@ __kernel void winograd_filter_transform_2x2_3x3_nchw( *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1; *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2; *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) } -/** This OpenCL kernel performs Winograd filter transform 3x3 when the data layout is NCHW and the output tile is 4x4 +/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4 * * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. 
-DSRC_DIM_Z=64 + * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -149,65 +166,73 @@ __kernel void winograd_filter_transform_4x4_3x3_nchw( const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); // Load the values from the input tensor +#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + float3 w0 = vload3(0, (__global float *)(src_addr)); +#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + float3 w0 = (float3)(*((__global float *)(src_addr + 0 * src_stride_y)), + *((__global float *)(src_addr + 1 * src_stride_y)), + *((__global float *)(src_addr + 2 * src_stride_y))); +#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y)); float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y)); float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y)); - - // Transform the 3x3 tile in a 6x6 tile - float8 out0 = 0.0f; - float8 out1 = 0.0f; - float8 out2 = 0.0f; - float8 out3 = 0.0f; - float8 out4 = 0.0f; - float8 out5 = 0.0f; +#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) // Row 0 - out0.s0 = (w0.s0) / 16.f; - out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f; - out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f; - out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f; - out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f; - out0.s5 = (w0.s2) / 4.f; - + float8 out0 = 0.0f; + out0.s0 = (w0.s0) / 16.f; + out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f; + out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f; + out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f; + out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * 
w0.s2) / 96.f; + out0.s5 = (w0.s2) / 4.f; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) // Row 1 - out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f; - out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f; - out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f; - out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f; - out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f; - out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f; + float8 out1 = 0.0f; + out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f; + out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f; + out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f; + out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f; + out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f; + out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f; // Row 2 - out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f; - out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f; - out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f; - out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f; - out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f; - out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f; + float8 out2 = 0.0f; + out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f; + out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f; + out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f; + out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * 
(-w0.s2 + w1.s2 - w2.s2)) / 144.f; + out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f; + out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f; // Row 3 - out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f; - out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f; + float8 out3 = 0.0f; + out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f; + out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f; + out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f; + out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; + out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; + out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f; // Row 4 - out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f; - out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * 
w1.s2 + 4.f * w2.s2)) / 576.f; - out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f; + float8 out4 = 0.0f; + out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f; + out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f; + out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f; + out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; + out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; + out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f; // Row 5 - out5.s0 = (w2.s0) / 4.f; - out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f; - out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f; - out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f; - out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f; - out5.s5 = (w2.s2); + float8 out5 = 0.0f; + out5.s0 = (w2.s0) / 4.f; + out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f; + out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f; + out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f; + out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f; + out5.s5 = (w2.s2); +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) int z = get_global_id(2); int x0 = z / SRC_DIM_Z; // idx filter @@ -216,13 +241,17 @@ __kernel void winograd_filter_transform_4x4_3x3_nchw( // Get output address __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; - // Store the 36 values across the 36 channels - *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global float *)(dst_addr + 1 * dst_stride_z) = 
out0.s1; - *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3; - *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4; - *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5; + // Store the values across the channels + // 36 channels for 3x3 kernels + // 6 channels for 3x1 or 1x3 kernels + *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3; + *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4; + *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s0; *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s1; *(__global float *)(dst_addr + 8 * dst_stride_z) = out1.s2; @@ -253,7 +282,204 @@ __kernel void winograd_filter_transform_4x4_3x3_nchw( *(__global float *)(dst_addr + 33 * dst_stride_z) = out5.s3; *(__global float *)(dst_addr + 34 * dst_stride_z) = out5.s4; *(__global float *)(dst_addr + 35 * dst_stride_z) = out5.s5; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +} + +#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) +/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_2x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + 
dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} +#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x2_1x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + 
dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); } +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) /** This OpenCL kernel performs Winograd filter transform 3x3 when the data layout is NHWC and the output tile is 4x4 * @@ -928,11 +1154,15 @@ __kernel void winograd_filter_transform_4x4_5x5_nhwc( } #endif // defined(SRC_DIM_Z) -#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) -/** This OpenCL kernel computes the input transform when the kernel size is 3x3 and the output tile is 2x2 +#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) +/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2 * * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X 
(i.e.-DNUM_TILES_X=5). * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time * * @param[in] src_ptr Pointer to the source image. Supported data types: F32 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) @@ -960,25 +1190,40 @@ __kernel void winograd_input_transform_2x2_3x3_stepz1_nchw( int z = get_global_id(2); // Compute input address - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 2 * src_stride_x + y * 2 * src_stride_y + z * src_stride_z; - - src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y); - + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z; + + src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y); + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + float4 in_row0 = vload4(0, (__global float *)(src_addr)); +#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + float4 in_row0 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)), + *((__global float *)(src_addr + 1 * src_stride_y)), + *((__global float *)(src_addr + 2 * src_stride_y)), + *((__global float *)(src_addr + 3 * src_stride_y))); +#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && 
!defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) float4 in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); float4 in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); float4 in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); float4 in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - float4 tmp0 = in_row0 - in_row2; - float4 tmp1 = in_row1 + in_row2; - float4 tmp2 = in_row2 - in_row1; - float4 tmp3 = in_row1 - in_row3; + float4 tmp0 = in_row0; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + tmp0 -= in_row2; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) float out00 = tmp0.s0 - tmp0.s2; float out01 = tmp0.s1 + tmp0.s2; float out02 = tmp0.s2 - tmp0.s1; float out03 = tmp0.s1 - tmp0.s3; +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + float4 tmp1 = in_row1 + in_row2; + float4 tmp2 = in_row2 - in_row1; + float4 tmp3 = in_row1 - in_row3; + float out10 = tmp1.s0 - tmp1.s2; float out11 = tmp1.s1 + tmp1.s2; float out12 = tmp1.s2 - tmp1.s1; @@ -993,13 +1238,16 @@ __kernel void winograd_input_transform_2x2_3x3_stepz1_nchw( float out31 = tmp3.s1 + tmp3.s2; float out32 = tmp3.s2 - tmp3.s1; float out33 = tmp3.s1 - tmp3.s3; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y; - *((__global float *)(dst_addr + 0 * dst_stride_z)) = out00; - *((__global float *)(dst_addr + 1 * dst_stride_z)) = out01; - *((__global float *)(dst_addr + 
2 * dst_stride_z)) = out02; - *((__global float *)(dst_addr + 3 * dst_stride_z)) = out03; + *((__global float *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00; + *((__global float *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01; + *((__global float *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02; + *((__global float *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) *((__global float *)(dst_addr + 4 * dst_stride_z)) = out10; *((__global float *)(dst_addr + 5 * dst_stride_z)) = out11; *((__global float *)(dst_addr + 6 * dst_stride_z)) = out12; @@ -1012,12 +1260,17 @@ __kernel void winograd_input_transform_2x2_3x3_stepz1_nchw( *((__global float *)(dst_addr + 13 * dst_stride_z)) = out31; *((__global float *)(dst_addr + 14 * dst_stride_z)) = out32; *((__global float *)(dst_addr + 15 * dst_stride_z)) = out33; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) } -/** This OpenCL kernel computes the input transform when the kernel size is 3x3, the output tile is 2x2 and the number of channels is multiple of 2 +/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2 * * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time * * @param[in] src_ptr Pointer to the source image. Supported data types: F32 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) @@ -1045,36 +1298,61 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw( int z = get_global_id(2) * 2; // Compute input address - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 2 * src_stride_x + y * 2 * src_stride_y + z * src_stride_z; - - src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y); - + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z; + + src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y); + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + float4 in_row0 = vload4(0, (__global float *)(src_addr)); +#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + float4 in_row0 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)), + *((__global float *)(src_addr + 1 * src_stride_y)), + *((__global float *)(src_addr + 2 * src_stride_y)), + *((__global float *)(src_addr + 3 * src_stride_y))); +#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) float4 in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); float4 in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); float4 in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); float4 in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); +#endif // 
!defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) src_addr += src_stride_z; +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + float4 in_row4 = vload4(0, (__global float *)(src_addr)); +#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + float4 in_row4 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)), + *((__global float *)(src_addr + 1 * src_stride_y)), + *((__global float *)(src_addr + 2 * src_stride_y)), + *((__global float *)(src_addr + 3 * src_stride_y))); +#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) float4 in_row4 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); float4 in_row5 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); float4 in_row6 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); float4 in_row7 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - float4 tmp0 = in_row0 - in_row2; + float4 tmp0 = in_row0; + float4 tmp4 = in_row4; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + tmp0 -= in_row2; + tmp4 -= in_row6; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + float2 out00 = (float2)(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2); + float2 out01 = (float2)(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2); + float2 out02 = (float2)(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1); + float2 out03 = (float2)(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3); + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) float4 tmp1 = in_row1 + in_row2; float4 tmp2 = in_row2 - in_row1; float4 tmp3 = in_row1 - in_row3; - float4 tmp4 = in_row4 - in_row6; float4 tmp5 = in_row5 + in_row6; float4 tmp6 = in_row6 - in_row5; float4 tmp7 = in_row5 - 
in_row7; - float2 out00 = (float2)(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2); - float2 out01 = (float2)(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2); - float2 out02 = (float2)(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1); - float2 out03 = (float2)(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3); - float2 out10 = (float2)(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2); float2 out11 = (float2)(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2); float2 out12 = (float2)(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1); @@ -1089,13 +1367,16 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw( float2 out31 = (float2)(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2); float2 out32 = (float2)(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1); float2 out33 = (float2)(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y; vstore2(out00, 0, (__global float *)(dst_addr + 0 * dst_stride_z)); vstore2(out01, 0, (__global float *)(dst_addr + 1 * dst_stride_z)); vstore2(out02, 0, (__global float *)(dst_addr + 2 * dst_stride_z)); vstore2(out03, 0, (__global float *)(dst_addr + 3 * dst_stride_z)); + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) vstore2(out10, 0, (__global float *)(dst_addr + 4 * dst_stride_z)); vstore2(out11, 0, (__global float *)(dst_addr + 5 * dst_stride_z)); vstore2(out12, 0, (__global float *)(dst_addr + 6 * dst_stride_z)); @@ -1108,12 +1389,17 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw( vstore2(out31, 0, (__global float *)(dst_addr + 13 * dst_stride_z)); vstore2(out32, 0, (__global float *)(dst_addr + 14 * dst_stride_z)); vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z)); +#endif // 
!defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) } /** This OpenCL kernel computes the input transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW * * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time * * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) @@ -1141,14 +1427,45 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nchw( int z = get_global_id(2); // Compute input address - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 4 * src_stride_x + y * 4 * src_stride_y + z * src_stride_z; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z; - src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y); + src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y); +#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + // Row0 + float4 d00 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)), + *((__global float *)(src_addr + 1 * src_stride_y)), + *((__global float *)(src_addr + 2 * src_stride_y)), + *((__global float *)(src_addr + 3 * src_stride_y))); + float2 d01 = (float2)(*((__global float *)(src_addr + 4 * src_stride_y)), + *((__global float *)(src_addr + 5 * src_stride_y))); +#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + // Row0 + float4 d00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); + float2 d01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + float out0 = 0.0f; + float out1 = 0.0f; + float out2 = 0.0f; + float out3 = 0.0f; + float out4 = 0.0f; + float out5 = 0.0f; + + // Channels [0, 5]: [out00, out01, out02, out03, out04, out05] + out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0; + out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0; + out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0; + out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0; + out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0; + out5 += 16.0f * d00.s1 - 20.0f 
* d00.s3 + 4.0f * d01.s1; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // Row4 float4 d40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); float2 d41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); + // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4 float k0 = d41.s0; float k1 = d41.s0; float k2 = d41.s0; @@ -1163,25 +1480,44 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nchw( k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2; k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1; - // Row0 - float4 d00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); - float2 d01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); + out0 += k0; + out1 += k1; + out2 += k2; + out3 += k3; + out4 += k4; + out5 += k5; // Row2 float4 d20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); float2 d21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); + out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0; + out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0; + out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0; + out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0; + out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0; + out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1; +#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + // Compute destination address - __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y); + __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y); uint dst_plane_stride = dst_stride_z / sizeof(float); - float out0 = k0; - float out1 = k1; - float out2 = k2; - float out3 = 
k3; - float out4 = k4; - float out5 = k5; + *(dst_addr) = out0; + dst_addr += dst_plane_stride; + *(dst_addr) = out1; + dst_addr += dst_plane_stride; + *(dst_addr) = out2; + dst_addr += dst_plane_stride; + *(dst_addr) = out3; + dst_addr += dst_plane_stride; + *(dst_addr) = out4; + dst_addr += dst_plane_stride; + *(dst_addr) = out5; + dst_addr += dst_plane_stride; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) float out6 = k0; float out7 = k1; float out8 = k2; @@ -1207,27 +1543,6 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nchw( float out28 = k4; float out29 = k5; - // Channels [0, 5]: [out00, out01, out02, out03, out04, out05] - out0 += 16.0f * d00.s0 - 20.0f * d00.s2 - 20.0f * d20.s0 + 25.0f * d20.s2 + 4.0f * d01.s0 - 5.0f * d21.s0; - out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0; - out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 - 20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0; - out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0; - out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 - 10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0; - out5 += 16.0f * d00.s1 - 20.0f * d00.s3 - 20.0f * d20.s1 + 4.0f * d01.s1 + 25.0f * d20.s3 - 5.0f * d21.s1; - - *(dst_addr) = out0; - dst_addr += dst_plane_stride; - *(dst_addr) = out1; - dst_addr += dst_plane_stride; - *(dst_addr) = out2; - dst_addr += dst_plane_stride; - *(dst_addr) = out3; - dst_addr += dst_plane_stride; - *(dst_addr) = out4; - dst_addr += dst_plane_stride; - *(dst_addr) = out5; - dst_addr += dst_plane_stride; - // Row1 float4 d10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); float2 d11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); @@ -1367,6 +1682,7 @@ __kernel void 
winograd_input_transform_4x4_3x3_stepz1_nchw( dst_addr += dst_plane_stride; *(dst_addr) = out5; dst_addr += dst_plane_stride; +#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) } #if defined(SRC_DIM_1) && defined(SRC_DIM_2) @@ -1711,7 +2027,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( dst_addr += dst_plane_stride; } -#endif /* defined(SRC_DIM_1) && defined(SRC_DIM_2) */ +#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2) #define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ ({ \ @@ -1733,7 +2049,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \ }) -/** This OpenCL kernel computes the input transform when the kernel size is 5x5 and the output tile is 4x4 +/** This OpenCL kernel computes the input transform when the kernel size is 5x5 and the output tile is 4x4 when the data layout is NCHW * * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). @@ -1882,14 +2198,61 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw( *((__global float *)(dst_addr + 63 * dst_stride_z)) = out7.s7; } -#if defined(SRC_DIM_1) && defined(SRC_DIM_2) +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) +/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. 
-DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} -/** This OpenCL kernel computes the input transform when the kernel size is 5x5, the output tile is 4x4 and data layout is NHWC +/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2 * * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). - * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112) - * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. 
-DSRC_DIM_2=112) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time * * @param[in] src_ptr Pointer to the source image. Supported data types: F32 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) @@ -1908,15 +2271,253 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw( * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( +__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) { - int x = get_global_id(0); - int y = get_global_id(1); - int z = get_global_id(2); + winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} - // Compute input address +/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + 
dst_step_z, + dst_offset_first_element_in_bytes); +} +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 
1x4 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} +#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + +#if defined(SRC_DIM_1) && defined(SRC_DIM_2) +/** This OpenCL kernel computes the input transform when the kernel size is 5x5 and the output tile is 4x4 when the data layout is NHWC + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + + // Compute input address __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float); // Clamp coordinates. 
This clamp is valid for all rows @@ -2150,12 +2751,16 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( *((__global float *)(dst_addr + 63 * dst_stride_z)) = out7.s7; } #endif // defined(SRC_DIM_1) && defined(SRC_DIM_2) -#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) +#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) -#if defined(NUM_TILES_X) -/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2, the filter size 3x3 and the data layout is NCHW +#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW * * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time * * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2183,21 +2788,29 @@ __kernel void winograd_output_transform_2x2_3x3_nchw( #endif // defined(HAS_BIAS) ) { - // Each thread stores a 2x2 tile + // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); - // Load the values across the 16 channels to compose the 4x4 tile + // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile float d00 = *((__global float *)(src_addr + 0 * src_stride_z)); float d01 = *((__global float *)(src_addr + 1 * src_stride_z)); float d02 = *((__global float *)(src_addr + 2 * src_stride_z)); float d03 = *((__global float *)(src_addr + 3 * src_stride_z)); - float d10 = *((__global float *)(src_addr + 4 * src_stride_z)); - float d11 = *((__global float *)(src_addr + 5 * src_stride_z)); - float d12 = *((__global float *)(src_addr + 6 * src_stride_z)); - float d13 = *((__global float *)(src_addr + 7 * src_stride_z)); +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // Compute the 2x1 or 1x2 output tile + // out00 = d00 + d01 + d02 + // out01 = d01 - d02 - d03 + + float out00 = d00 + d01 + d02; + float out01 = d01 - d02 - d03; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + float d10 = *((__global float *)(src_addr + 4 * src_stride_z)); + float d11 = *((__global float *)(src_addr + 5 * src_stride_z)); + float d12 = *((__global float *)(src_addr + 6 * src_stride_z)); + float d13 = *((__global float *)(src_addr + 7 * src_stride_z)); float d20 = *((__global float *)(src_addr + 8 * src_stride_z)); float d21 = *((__global float *)(src_addr + 9 * src_stride_z)); @@ -2229,10 +2842,11 @@ __kernel void winograd_output_transform_2x2_3x3_nchw( out01 += k0 - k1 - (d03 + d23); out10 += -d20 - d30 
+ k2 + k3; out11 += k2 - k3 + d23 + d33; +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) int y_in = get_global_id(1); - int x_out = (y_in % NUM_TILES_X) * 2; - int y_out = (y_in / NUM_TILES_X) * 2; + int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; int z_out = get_global_id(0); #if defined(HAS_BIAS) @@ -2243,21 +2857,37 @@ __kernel void winograd_output_transform_2x2_3x3_nchw( out00 += (float)b; out01 += (float)b; - out10 += (float)b; - out11 += (float)b; #endif // defined(HAS_BIAS) // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z; - // Store the 2x2 output tile + // Store the output tile +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00; + *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y)); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(HAS_BIAS) + // Add bias + out10 += (float)b; + out11 += (float)b; +#endif // defined(HAS_BIAS) + vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y)); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } /** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW * * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. 
-DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2285,12 +2915,12 @@ __kernel void winograd_output_transform_4x4_3x3_nchw( #endif // defined(HAS_BIAS) ) { - // Each thread stores a 4x4 tile + // Each thread stores a 4x4/4x1 or 1x4 tile Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); - // Load the values across the 36 channels to compose the 6x6 tile + // Load the values across the channels to compose the 6x6 or 6x1 tile float d00 = *((__global float *)(src_addr + 0 * src_stride_z)); float d01 = *((__global float *)(src_addr + 1 * src_stride_z)); float d02 = *((__global float *)(src_addr + 2 * src_stride_z)); @@ -2298,6 +2928,13 @@ __kernel void winograd_output_transform_4x4_3x3_nchw( float d04 = *((__global float *)(src_addr + 4 * src_stride_z)); float d05 = *((__global float *)(src_addr + 5 * src_stride_z)); +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // Compute out00, out01, out02 and out03 + float out00 = d00 + d01 + d02 + d03 + d04; + float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04; + float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04; + float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 
float d10 = *((__global float *)(src_addr + 6 * src_stride_z)); float d11 = *((__global float *)(src_addr + 7 * src_stride_z)); float d12 = *((__global float *)(src_addr + 8 * src_stride_z)); @@ -2388,10 +3025,11 @@ __kernel void winograd_output_transform_4x4_3x3_nchw( out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52; out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52; out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55; +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) int y_in = get_global_id(1); - int x_out = (y_in % NUM_TILES_X) * 4; - int y_out = (y_in / NUM_TILES_X) * 4; + int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; int z_out = get_global_id(0); #if defined(HAS_BIAS) @@ -2404,7 +3042,24 @@ __kernel void winograd_output_transform_4x4_3x3_nchw( out01 += (float)b; out02 += (float)b; out03 += (float)b; +#endif // defined(HAS_BIAS) + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z; + + // Store the output tile +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00; + *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01; + *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02; + *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y)); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(HAS_BIAS) + // Add bias out10 += (float)b; out11 += (float)b; out12 += (float)b; @@ -2419,18 +3074,252 @@ __kernel void 
winograd_output_transform_4x4_3x3_nchw( out31 += (float)b; out32 += (float)b; out33 += (float)b; - #endif // defined(HAS_BIAS) - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z; - - // Store the 4x4 output tile - vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y)); vstore4((float4)(out10, out11, out12, out13), 0, (__global float *)(dst_addr + 1 * dst_stride_y)); vstore4((float4)(out20, out21, out22, out23), 0, (__global float *)(dst_addr + 2 * dst_stride_y)); vstore4((float4)(out30, out31, out32, out33), 0, (__global float *)(dst_addr + 3 * dst_stride_y)); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x1_3x1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , +
bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_3x1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g.
-DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x2_1x3_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, +
dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x4_1x3_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); } +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) /** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NHWC * @@ -2815,7 +3704,7 @@ __kernel void winograd_output_transform_4x4_5x5_nchw( *(__global float *)(dst_addr + 3 * dst_stride_x + 3 * dst_stride_y) = out_col3.s3; } -/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 5x5 and the data format is NHWC +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 5x5 and the data layout is NHWC * *
@note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @@ -2990,4 +3879,4 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( *(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s3) = out_col2.s3; *(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s3) = out_col3.s3; } -#endif // defined(NUM_TILES_X) +#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp index 779df637f6..e6c713e5e7 100644 --- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp @@ -25,7 +25,6 @@ #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" @@ -54,12 +53,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd filter transform only supports 3x3 and 5x5 kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 4x4 output tile for NHWC data layout"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) - && output_tile_size != Size2D(4U, 4U), - "Winograd filter transform only supports 2x2 or 4x4 output tile for 3x3 kernels"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 4x4 output tile for 5x5 kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported"); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -115,6 +109,8 @@ void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTenso // Set build options CLBuildOptions build_opts; build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL"); const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp index 274c9e7c3d..bb484afafb 100644 --- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "support/ToolchainSupport.h" @@ -45,12 +46,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only 
supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd input transform only supports 3x3 and 5x5 kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && output_tile_size != Size2D(4U, 4U), "Winograd input transform only supports 4x4 output tile for NHWC data layout"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) - && output_tile_size != Size2D(4U, 4U), - "Winograd input transform only supports 2x2 or 4x4 output tile for 3x3 kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd input transform only supports 4x4 output tile for 5x5 kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); + ARM_COMPUTE_UNUSED(conv_info); ARM_COMPUTE_UNUSED(output_tile_size); ARM_COMPUTE_UNUSED(kernel_size); @@ -131,8 +128,6 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right(); const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom(); - _input = input; - _output = output; if(input->info()->data_layout() == DataLayout::NCHW) { // Check if we need to extend the right or bottom border @@ -145,8 +140,17 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor { _border_size = BorderSize(1U, 0U, 1U, 0); } - _num_tiles_x = std::ceil(num_elements_x / static_cast(output_tile_size.width)); - _num_tiles_y = std::ceil(num_elements_y / static_cast(output_tile_size.height)); + + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const 
Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input->info()->dimension(idx_w), input->info()->dimension(idx_h)), + kernel_size, + output_tile_size, + conv_info); + + _input = input; + _output = output; + _num_tiles_x = num_tiles.width; + _num_tiles_y = num_tiles.height; const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input->info(), winograd_info); @@ -159,6 +163,10 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); + build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); + build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); if(input->info()->data_layout() == DataLayout::NHWC) { @@ -169,8 +177,11 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor // Create kernel std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); + // Get the maximum dimension from the tile size + const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); + // Check optimized kernel if output_dims == 2x2 - if(output_tile_size == Size2D(2U, 2U)) + if(tile_max_dim == 2) { _step_z = (_input->info()->dimension(2) % 2) != 0 ? 
1 : 2; } @@ -199,6 +210,8 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor _config_id += support::cpp11::to_string(conv_info.pad_left()); _config_id += "_"; _config_id += support::cpp11::to_string(conv_info.pad_top()); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); } Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp index 980498c4d1..40d5f6588f 100644 --- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp @@ -55,20 +55,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; const Size2D input_dimensions = winograd_info.input_dimensions; + const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Only 3x3 and 5x5 kernels are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && output_tile_size != Size2D(4U, 4U), "Only 4x4 output tile supported for NHWC data layout"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(2U, 2U) && input->dimension(2) != 16, "Wrong number of batches"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 36, "Wrong number of batches"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size == Size2D(4U, 4U) && 
input->dimension(2) != 64, "Wrong number of batches"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); // Compute number of elements to process in the X and Y direction - const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right(); - const int num_elements_y = input_dimensions.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom(); - const int num_tiles_x = std::ceil(num_elements_x / static_cast(output_tile_size.width)); - const int num_tiles_y = std::ceil(num_elements_y / static_cast(output_tile_size.height)); + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, + kernel_size, + output_tile_size, + conv_info); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast((num_tiles_x * num_tiles_y))); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast((num_tiles.area()))); if(bias != nullptr) { @@ -150,13 +149,21 @@ void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const IC const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; const PadStrideInfo conv_info = winograd_info.convolution_info; - const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right(); - const int num_tiles_x = std::ceil(num_elements_x / static_cast(output_tile_size.width)); + + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, + kernel_size, + output_tile_size, + conv_info); 
// Set build options CLBuildOptions build_opts; build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS")); - build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles_x)); + build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width)); + build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); + build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); // Create kernel std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); @@ -179,6 +186,8 @@ void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const IC _config_id += support::cpp11::to_string(output->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(1)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout)); } Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info) diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 49753ad080..11714fac41 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -37,11 +37,27 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims) { Size2D output_tile = Size2D{}; - if(kernel_dims == Size2D(3U, 3U)) + const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); + 
+ // Check if the input spatial dimensions are smaller than 4 + const bool is_input_lt4 = (input_dims.width <= 4 && input_dims.height <= 4); + + if(kernel_max_dim == 3U) { - output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); + if(kernel_dims == Size2D(3U, 3U)) + { + output_tile = is_input_lt4 ? Size2D(2U, 2U) : Size2D(4U, 4U); + } + else if(kernel_dims == Size2D(3U, 1U)) + { + output_tile = is_input_lt4 ? Size2D(2U, 1U) : Size2D(4U, 1U); + } + else + { + output_tile = is_input_lt4 ? Size2D(1U, 2U) : Size2D(1U, 4U); + } } - else if(kernel_dims == Size2D(5U, 5U)) + else if(kernel_max_dim == 5U) { output_tile = Size2D(4U, 4U); } diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h index 36b3d60d57..ae25c8cd66 100644 --- a/tests/datasets/LargeConvolutionLayerDataset.h +++ b/tests/datasets/LargeConvolutionLayerDataset.h @@ -59,6 +59,50 @@ public: } }; +class LargeWinogradConvolutionLayer3x1Dataset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer3x1Dataset() + { + // Kernel size 3 + // Batch size 1 + add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 1U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 1U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 1U, 128U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 1U, 125U, 256U), TensorShape(256U), TensorShape(51U, 56U, 256U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 1U, 256U, 256U), TensorShape(256U), TensorShape(56U, 56U, 256U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 1U, 257U, 512U), TensorShape(512U), 
TensorShape(26U, 28U, 512U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 1U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 1U, 512U, 512U), TensorShape(512U), TensorShape(12U, 14U, 512U), PadStrideInfo(1, 1, 0, 0)); + // Batch size 3, 2 and 4 + add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 1U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 1U, 64U, 128U), TensorShape(128U), TensorShape(110U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 1U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 0)); + } +}; + +class LargeWinogradConvolutionLayer1x3Dataset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer1x3Dataset() + { + // Kernel size 3 + // Batch size 1 + add_config(TensorShape(224U, 222U, 64U), TensorShape(1U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(112U, 113U, 64U), TensorShape(1U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(112U, 112U, 128U), TensorShape(1U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(53U, 56U, 125U), TensorShape(1U, 3U, 125U, 256U), TensorShape(256U), TensorShape(53U, 56U, 256U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(56U, 56U, 256U), TensorShape(1U, 3U, 256U, 256U), TensorShape(256U), TensorShape(56U, 54U, 256U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(28U, 28U, 257U), TensorShape(1U, 3U, 257U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 0, 1)); + 
add_config(TensorShape(28U, 28U, 512U), TensorShape(1U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(14U, 14U, 512U), TensorShape(1U, 3U, 512U, 512U), TensorShape(512U), TensorShape(14U, 12U, 512U), PadStrideInfo(1, 1, 0, 0)); + // Batch size 3, 2 and 4 + add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(1U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(1U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(1U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 0, 1)); + } +}; + class LargeWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset { public: diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index a5620ff7cf..68263c7793 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -388,6 +388,38 @@ public: } }; +/** Data set containing small 3x1 tensor shapes. */ +class Small3x1Shapes final : public ShapeDataset +{ +public: + Small3x1Shapes() + : ShapeDataset("Shape", + { + TensorShape{ 3U, 1U, 7U, 4U }, + TensorShape{ 3U, 1U, 4U, 13U }, + TensorShape{ 3U, 1U, 9U, 2U }, + TensorShape{ 3U, 1U, 3U, 5U }, + }) + { + } +}; + +/** Data set containing small 1x3 tensor shapes. */ +class Small1x3Shapes final : public ShapeDataset +{ +public: + Small1x3Shapes() + : ShapeDataset("Shape", + { + TensorShape{ 1U, 3U, 7U, 4U }, + TensorShape{ 1U, 3U, 4U, 13U }, + TensorShape{ 1U, 3U, 9U, 2U }, + TensorShape{ 1U, 3U, 3U, 5U }, + }) + { + } +}; + /** Data set containing large 3x3 tensor shapes. */ class Large3x3Shapes final : public ShapeDataset { @@ -404,6 +436,38 @@ public: } }; +/** Data set containing large 3x1 tensor shapes. 
*/ +class Large3x1Shapes final : public ShapeDataset +{ +public: + Large3x1Shapes() + : ShapeDataset("Shape", + { + TensorShape{ 3U, 1U, 32U, 64U }, + TensorShape{ 3U, 1U, 51U, 13U }, + TensorShape{ 3U, 1U, 53U, 47U }, + TensorShape{ 3U, 1U, 128U, 384U }, + }) + { + } +}; + +/** Data set containing large 1x3 tensor shapes. */ +class Large1x3Shapes final : public ShapeDataset +{ +public: + Large1x3Shapes() + : ShapeDataset("Shape", + { + TensorShape{ 1U, 3U, 32U, 64U }, + TensorShape{ 1U, 3U, 51U, 13U }, + TensorShape{ 1U, 3U, 53U, 47U }, + TensorShape{ 1U, 3U, 128U, 384U }, + }) + { + } +}; + /** Data set containing small 5x5 tensor shapes. */ class Small5x5Shapes final : public ShapeDataset { diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h index fed36de3dd..f05cc15c06 100644 --- a/tests/datasets/SmallConvolutionLayerDataset.h +++ b/tests/datasets/SmallConvolutionLayerDataset.h @@ -52,6 +52,36 @@ public: } }; +class SmallWinogradConvolutionLayer3x1Dataset final : public ConvolutionLayerDataset +{ +public: + SmallWinogradConvolutionLayer3x1Dataset() + { + // Channel size big enough to force multithreaded execution of the input transform + add_config(TensorShape(8U, 8U, 32U), TensorShape(3U, 1U, 32U, 1U), TensorShape(1U), TensorShape(6U, 8U, 1U), PadStrideInfo(1, 1, 0, 0)); + // Batch size 1 + add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 1U, 2U, 1U), TensorShape(1U), TensorShape(6U, 8U, 1U), PadStrideInfo(1, 1, 0, 0)); + // Batch size 4 + add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 1U, 5U, 21U), TensorShape(21U), TensorShape(21U, 27U, 21U, 4U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 1U, 2U, 1U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 1, 0)); + } +}; + +class SmallWinogradConvolutionLayer1x3Dataset final : public ConvolutionLayerDataset +{ +public: + SmallWinogradConvolutionLayer1x3Dataset() + { + // Channel size big enough 
to force multithreaded execution of the input transform + add_config(TensorShape(8U, 8U, 32U), TensorShape(1U, 3U, 32U, 1U), TensorShape(1U), TensorShape(8U, 6U, 1U), PadStrideInfo(1, 1, 0, 0)); + // Batch size 1 + add_config(TensorShape(8U, 8U, 2U), TensorShape(1U, 3U, 2U, 1U), TensorShape(1U), TensorShape(8U, 6U, 1U), PadStrideInfo(1, 1, 0, 0)); + // Batch size 4 + add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(1U, 3U, 5U, 21U), TensorShape(21U), TensorShape(23U, 25U, 21U, 4U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(8U, 8U, 2U), TensorShape(1U, 3U, 2U, 1U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 0, 1)); + } +}; + class SmallWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset { public: diff --git a/tests/datasets/WinogradInputTransformDataset.h b/tests/datasets/WinogradInputTransformDataset.h index e365f9657f..ca23984a1d 100644 --- a/tests/datasets/WinogradInputTransformDataset.h +++ b/tests/datasets/WinogradInputTransformDataset.h @@ -112,6 +112,36 @@ public: } }; +class SmallWinogradInputTransformDataset2x1_3x1 final : public WinogradInputTransformDataset +{ +public: + SmallWinogradInputTransformDataset2x1_3x1() + { + add_config(TensorShape(9U, 9U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(9U, 9U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(27U, 13U, 2U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(27U, 13U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(128U, 64U, 1U, 3U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(128U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(9U, 9U, 3U, 4U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(27U, 13U, 2U, 4U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(27U, 13U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(9U, 9U, 3U, 5U), 
WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(14U, 14U, 512U, 2U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + } +}; + +class SmallWinogradInputTransformDataset1x2_1x3 final : public WinogradInputTransformDataset +{ +public: + SmallWinogradInputTransformDataset1x2_1x3() + { + add_config(TensorShape(9U, 9U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(27U, 13U, 2U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(27U, 13U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(128U, 64U, 1U, 3U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(128U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(9U, 9U, 3U, 4U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(27U, 13U, 2U, 4U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(27U, 13U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(9U, 9U, 3U, 5U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(14U, 14U, 512U, 2U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + } +}; + class SmallWinogradInputTransformDataset4x4_3x3 final : public WinogradInputTransformDataset { public: @@ -127,6 +157,36 @@ public: } }; +class SmallWinogradInputTransformDataset4x1_3x1 final : public WinogradInputTransformDataset +{ +public: + SmallWinogradInputTransformDataset4x1_3x1() + { + add_config(TensorShape(9U, 9U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(9U, 9U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(27U, 13U, 2U), WinogradInfo(Size2D(4U, 
1U), Size2D(3U, 1U), Size2D(27U, 13U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(128U, 64U, 1U, 3U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(128U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(9U, 9U, 3U, 4U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(27U, 13U, 2U, 4U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(27U, 13U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(9U, 9U, 3U, 5U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(14U, 14U, 512U, 2U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + } +}; + +class SmallWinogradInputTransformDataset1x4_1x3 final : public WinogradInputTransformDataset +{ +public: + SmallWinogradInputTransformDataset1x4_1x3() + { + add_config(TensorShape(9U, 9U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(27U, 13U, 2U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(27U, 13U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(128U, 64U, 1U, 3U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(128U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(9U, 9U, 3U, 4U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(27U, 13U, 2U, 4U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(27U, 13U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(9U, 9U, 3U, 5U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(14U, 14U, 512U, 2U), WinogradInfo(Size2D(1U, 4U), 
Size2D(1U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + } +}; + class SmallWinogradInputTransformDataset4x4_5x5 final : public WinogradInputTransformDataset { public: @@ -154,6 +214,30 @@ public: } }; +class LargeWinogradInputTransformDataset2x1_3x1 final : public WinogradInputTransformDataset +{ +public: + LargeWinogradInputTransformDataset2x1_3x1() + { + add_config(TensorShape(42U, 37U, 8U, 15U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(42U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(57U, 60U, 13U, 8U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(57U, 60U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(128U, 64U, 21U, 13U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(128U, 64U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(83U, 72U, 14U, 5U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(83U, 72U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + } +}; + +class LargeWinogradInputTransformDataset1x2_1x3 final : public WinogradInputTransformDataset +{ +public: + LargeWinogradInputTransformDataset1x2_1x3() + { + add_config(TensorShape(42U, 37U, 8U, 15U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(42U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(57U, 60U, 13U, 8U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(57U, 60U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(128U, 64U, 21U, 13U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(128U, 64U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(83U, 72U, 14U, 5U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(83U, 72U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + } +}; + class LargeWinogradInputTransformDataset4x4_3x3 final : public WinogradInputTransformDataset { public: @@ -166,6 +250,30 @@ public: } }; +class LargeWinogradInputTransformDataset4x1_3x1 final : 
public WinogradInputTransformDataset +{ +public: + LargeWinogradInputTransformDataset4x1_3x1() + { + add_config(TensorShape(42U, 37U, 8U, 15U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(42U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(57U, 60U, 13U, 8U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(57U, 60U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(128U, 64U, 21U, 13U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(128U, 64U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(83U, 72U, 14U, 5U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(83U, 72U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + } +}; + +class LargeWinogradInputTransformDataset1x4_1x3 final : public WinogradInputTransformDataset +{ +public: + LargeWinogradInputTransformDataset1x4_1x3() + { + add_config(TensorShape(42U, 37U, 8U, 15U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(42U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(57U, 60U, 13U, 8U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(57U, 60U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); + add_config(TensorShape(128U, 64U, 21U, 13U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(128U, 64U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(83U, 72U, 14U, 5U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(83U, 72U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + } +}; + class LargeWinogradInputTransformDataset4x4_5x5 final : public WinogradInputTransformDataset { public: diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h index c7ba3b2b7d..a4689c6ef1 100644 --- a/tests/datasets/WinogradOutputTransformDataset.h +++ b/tests/datasets/WinogradOutputTransformDataset.h @@ -99,12 +99,11 @@ private: std::vector _info{}; }; -class SmallWinogradOutputTransformDataset final : public 
WinogradOutputTransformDataset +class SmallWinogradOutputTransformDatasetNCHW final : public WinogradOutputTransformDataset { public: - SmallWinogradOutputTransformDataset() + SmallWinogradOutputTransformDatasetNCHW() { - // NCHW // (2x2, 3x3) add_config(TensorShape(13U, 6U, 16U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); add_config(TensorShape(7U, 20U, 16U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); @@ -120,6 +119,34 @@ public: add_config(TensorShape(24U, 16U, 36U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); add_config(TensorShape(7U, 12U, 16U, 5U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + // (2x1, 3x1) + add_config(TensorShape(13U, 18U, 4U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(7U, 44U, 4U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(1U, 891U, 4U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(53U, 33U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + add_config(TensorShape(7U, 30U, 4U, 3U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(24U, 98U, 4U, 2U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + + // (1x2, 1x3) + add_config(TensorShape(13U, 14U, 4U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(7U, 50U, 4U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(1U, 901U, 4U), 
WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + add_config(TensorShape(7U, 32U, 4U, 3U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(24U, 98U, 4U, 2U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + + // (4x1, 3x1) + add_config(TensorShape(13U, 12U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(7U, 22U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(1U, 462U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(53U, 33U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + add_config(TensorShape(7U, 20U, 6U, 3U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(24U, 56U, 6U, 2U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + + // (1x4, 1x3) + add_config(TensorShape(13U, 7U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(7U, 30U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(1U, 477U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + add_config(TensorShape(7U, 16U, 6U, 3U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(24U, 56U, 6U, 2U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + // (4x4, 5x5) 
add_config(TensorShape(13U, 1U, 64U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); add_config(TensorShape(7U, 4U, 64U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); @@ -127,8 +154,14 @@ public: add_config(TensorShape(7U, 2U, 64U, 3U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); add_config(TensorShape(24U, 9U, 64U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)); add_config(TensorShape(7U, 2U, 64U, 5U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + } +}; - // NHWC +class SmallWinogradOutputTransformDatasetNHWC final : public WinogradOutputTransformDataset +{ +public: + SmallWinogradOutputTransformDatasetNHWC() + { // (4x4, 3x3) add_config(TensorShape(13U, 4U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(10U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC)); add_config(TensorShape(13U, 6U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC)); @@ -146,10 +179,10 @@ public: } }; -class LargeWinogradOutputTransformDataset final : public WinogradOutputTransformDataset +class LargeWinogradOutputTransformDatasetNCHW final : public WinogradOutputTransformDataset { public: - LargeWinogradOutputTransformDataset() + LargeWinogradOutputTransformDatasetNCHW() { // NCHW // (2x2, 3x3) @@ -168,13 +201,51 @@ public: add_config(TensorShape(32U, 784U, 36U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); add_config(TensorShape(13U, 196U, 36U, 5U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + // (2x1, 3x1) + add_config(TensorShape(64U, 24976U, 4U), 
WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(224U, 223U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + add_config(TensorShape(32U, 6160U, 4U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(112U, 110U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + add_config(TensorShape(13U, 1568U, 4U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(56U, 56U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + add_config(TensorShape(64U, 24753U, 4U, 3U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(32U, 6050U, 4U, 2U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(13U, 1512U, 4U, 5U), WinogradInfo(Size2D(2U, 1U), Size2D(3U, 1U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + + // (1x2, 1x3) + add_config(TensorShape(64U, 25088U, 4U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + add_config(TensorShape(32U, 6160U, 4U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + add_config(TensorShape(13U, 1568U, 4U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + add_config(TensorShape(64U, 24864U, 4U, 3U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(32U, 6048U, 4U, 2U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(13U, 1512U, 4U, 5U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + + // (4x1, 3x1) + add_config(TensorShape(64U, 12488U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(224U, 223U), PadStrideInfo(1, 1, 1, 0), 
DataLayout::NCHW)); + add_config(TensorShape(32U, 3080U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(112U, 110U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + add_config(TensorShape(13U, 784U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(56U, 56U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); + add_config(TensorShape(64U, 12488U, 6U, 3U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(32U, 3080U, 6U, 2U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(13U, 784U, 6U, 5U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + + // (1x4, 1x3) + add_config(TensorShape(64U, 12544U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + add_config(TensorShape(32U, 3136U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + add_config(TensorShape(13U, 784U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + add_config(TensorShape(64U, 12544U, 6U, 3U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(32U, 3024U, 6U, 2U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + add_config(TensorShape(13U, 784U, 6U, 5U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW)); + // (4x4, 5x5) add_config(TensorShape(32U, 756U, 64U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); add_config(TensorShape(13U, 182U, 64U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), 
Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); add_config(TensorShape(32U, 756U, 64U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW)); add_config(TensorShape(13U, 182U, 64U, 5U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW)); + } +}; - // NHWC +class LargeWinogradOutputTransformDatasetNHWC final : public WinogradOutputTransformDataset +{ +public: + LargeWinogradOutputTransformDatasetNHWC() + { // (4x4, 3x3) add_config(TensorShape(64U, 3136U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(224U, 224U), PadStrideInfo(1, 1, 1, 1), DataLayout::NHWC)); add_config(TensorShape(32U, 784U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC)); diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp index b869f4c314..f68ec8c286 100644 --- a/tests/validation/CL/Winograd.cpp +++ b/tests/validation/CL/Winograd.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" #include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" @@ -51,12 +52,66 @@ namespace validation { namespace { +// *INDENT-OFF* +// clang-format off constexpr AbsoluteTolerance tolerance_f32(0.001f); constexpr AbsoluteTolerance tolerance_convolution_layer_f32(0.1f); -const auto SmallWinogradInputTransformDataset = framework::dataset::concat(datasets::SmallWinogradInputTransformDataset2x2_3x3(), - framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_3x3(), datasets::SmallWinogradInputTransformDataset4x4_5x5())); -const auto LargeWinogradInputTransformDataset = 
framework::dataset::concat(datasets::LargeWinogradInputTransformDataset2x2_3x3(), - framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x4_3x3(), datasets::LargeWinogradInputTransformDataset4x4_5x5())); + +// Input transform +const auto SmallWinogradInputTransformDatasetNCHW = + framework::dataset::concat(datasets::SmallWinogradInputTransformDataset2x2_3x3(), + framework::dataset::concat(datasets::SmallWinogradInputTransformDataset2x1_3x1(), + framework::dataset::concat(datasets::SmallWinogradInputTransformDataset1x2_1x3(), + framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_3x3(), + framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x1_3x1(), + framework::dataset::concat(datasets::SmallWinogradInputTransformDataset1x4_1x3(), + datasets::SmallWinogradInputTransformDataset4x4_5x5())))))); + +const auto SmallWinogradInputTransformDatasetNHWC = framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_3x3(), + datasets::SmallWinogradInputTransformDataset4x4_5x5()); + +const auto LargeWinogradInputTransformDatasetNCHW = + framework::dataset::concat(datasets::LargeWinogradInputTransformDataset2x2_3x3(), + framework::dataset::concat(datasets::LargeWinogradInputTransformDataset2x1_3x1(), + framework::dataset::concat(datasets::LargeWinogradInputTransformDataset1x2_1x3(), + framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x4_3x3(), + framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x1_3x1(), + framework::dataset::concat(datasets::LargeWinogradInputTransformDataset1x4_1x3(), + datasets::LargeWinogradInputTransformDataset4x4_5x5())))))); + +const auto LargeWinogradInputTransformDatasetNHWC = + framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x4_3x3(), + datasets::LargeWinogradInputTransformDataset4x4_5x5()); + +// Filter transform +const auto SmallWinogradFilterTransformDatasetNCHW = + 
framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 2U), Size2D(4U, 4U) })), + framework::dataset::concat(combine(datasets::Small3x1Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 1U), Size2D(4U, 1U) })), + framework::dataset::concat(combine(datasets::Small1x3Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 2U), Size2D(1U, 4U) })), + combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) }))))); + +const auto SmallWinogradFilterTransformDatasetNHWC = + framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })), + combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) }))); + +const auto LargeWinogradFilterTransformDatasetNCHW = + framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 2U), Size2D(4U, 4U) })), + framework::dataset::concat(combine(datasets::Large3x1Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 1U), Size2D(4U, 1U) })), + framework::dataset::concat(combine(datasets::Large1x3Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 2U), Size2D(1U, 4U) })), + combine(datasets::Large5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) }))))); + +const auto LargeWinogradFilterTransformDatasetNHWC = + framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })), + combine(datasets::Large5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) }))); + +// Output transform +const auto SmallWinogradOutputTransformDatasetNCHW = datasets::SmallWinogradOutputTransformDatasetNCHW(); + +const auto SmallWinogradOutputTransformDatasetNHWC = datasets::SmallWinogradOutputTransformDatasetNHWC(); + +const auto LargeWinogradOutputTransformDatasetNCHW = 
datasets::LargeWinogradOutputTransformDatasetNCHW(); + +const auto LargeWinogradOutputTransformDatasetNHWC = datasets::LargeWinogradOutputTransformDatasetNHWC(); } // namespace using namespace arm_compute::misc::shape_calculator; @@ -65,9 +120,6 @@ TEST_SUITE(CL) TEST_SUITE(Winograd) TEST_SUITE(InputTransform) - -// *INDENT-OFF* -// clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("InputInfo",{ TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F16), // F16 not supported @@ -101,17 +153,20 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( { ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS); } -// clang-format on -// *INDENT-ON* using CLWinogradInputTransformFixture = WinogradInputTransformValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(SmallWinogradInputTransformDataset, LargeWinogradInputTransformDataset), +TEST_SUITE(NCHW) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(SmallWinogradInputTransformDatasetNCHW, + LargeWinogradInputTransformDatasetNCHW), framework::dataset::make("DataLayout", { DataLayout::NCHW })), - framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("DataType", { DataType::F32 })), shape_in, winograd_info, data_layout, data_type) { - TensorShape shape_out = compute_winograd_input_transform_shape(TensorInfo(shape_in, 1, data_type), winograd_info); + TensorInfo tensor_info_in(shape_in, 1, data_type); + tensor_info_in.set_data_layout(data_layout); + + TensorShape shape_out = compute_winograd_input_transform_shape(tensor_info_in, winograd_info); // Create tensors CLTensor in = create_tensor(shape_in, data_type, 1, 0, QuantizationInfo(), data_layout); 
@@ -127,28 +182,70 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(frame winograd_input_transform.configure(&in, &out, winograd_info); } -FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixture, framework::DatasetMode::PRECOMMIT, combine(framework::dataset::concat(combine(SmallWinogradInputTransformDataset, - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - combine(framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_3x3(), datasets::SmallWinogradInputTransformDataset4x4_5x5()), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))), - framework::dataset::make("DataType", { DataType::F32 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixture, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNCHW, + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("DataType", { DataType::F32 }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixture, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNCHW, + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("DataType", { DataType::F32 }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() // NCHW + +TEST_SUITE(NHWC) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(SmallWinogradInputTransformDatasetNHWC, + LargeWinogradInputTransformDatasetNHWC), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("DataType", { DataType::F32 })), + shape_in, winograd_info, data_layout, data_type) +{ + TensorShape shape_in_nhwc(shape_in); + + // Convert the shape to NHWC + permute(shape_in_nhwc, PermutationVector(2U, 0U, 1U)); + + // TensorInfo + TensorInfo tensor_info_in(shape_in_nhwc, 1, 
data_type); + tensor_info_in.set_data_layout(data_layout); + + TensorShape shape_out = compute_winograd_input_transform_shape(tensor_info_in, winograd_info); + + // Create tensors + CLTensor in = create_tensor(shape_in_nhwc, data_type, 1, 0, QuantizationInfo(), data_layout); + CLTensor out = create_tensor(shape_out, data_type); + + ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + CLWinogradInputTransform winograd_input_transform; + + // Configure the function + winograd_input_transform.configure(&in, &out, winograd_info); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixture, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNHWC, + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("DataType", { DataType::F32 }))) { validate(CLAccessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixture, framework::DatasetMode::NIGHTLY, combine(framework::dataset::concat(combine(LargeWinogradInputTransformDataset, - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - combine(framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x4_3x3(), datasets::LargeWinogradInputTransformDataset4x4_5x5()), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))), - framework::dataset::make("DataType", { DataType::F32 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixture, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNHWC, + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("DataType", { DataType::F32 }))) { validate(CLAccessor(_target), _reference, tolerance_f32); } +TEST_SUITE_END() // NHWC TEST_SUITE_END() // InputTransform TEST_SUITE(FilterTransform) -// *INDENT-OFF* -// 
clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("InputInfo",{ TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::F16), // F16 not supported @@ -182,19 +279,19 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( { ARM_COMPUTE_EXPECT(bool(CLWinogradFilterTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS); } -// clang-format on -// *INDENT-ON* using CLWinogradFilterTransform = CLSynthetizeFunctionWithZeroConstantBorder; using CLWinogradFilterTransformFixture = WinogradFilterTransformValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::concat(datasets::Small3x3Shapes(), datasets::Large3x3Shapes()), - framework::dataset::make("OutputTile", { Size2D(2U, 2U), Size2D(4U, 4U) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - framework::dataset::make("DataType", { DataType::F32 })), +TEST_SUITE(NCHW) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, + combine(combine(framework::dataset::concat(SmallWinogradFilterTransformDatasetNCHW, + LargeWinogradFilterTransformDatasetNCHW), + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("DataType", { DataType::F32 })), shape_a, output_tile, data_layout, data_type) { - WinogradInfo winograd_info(output_tile, Size2D(shape_a[0], shape_a[1]), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW /* Not needed */); + WinogradInfo winograd_info(output_tile, Size2D(shape_a[0], shape_a[1]), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, data_layout /* Not needed */); TensorShape shape_b = compute_winograd_filter_transform_shape(TensorInfo(shape_a, 1, data_type), winograd_info); @@ -210,37 +307,79 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, 
combine(combine(combi winograd_filter_transform.configure(&a, &b, winograd_info); } -FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixture, framework::DatasetMode::ALL, - combine(framework::dataset::concat(combine(framework::dataset::concat(framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", Size2D(2U, 2U))), - combine(datasets::Small3x3Shapes(), - framework::dataset::make("OutputTile", Size2D(4U, 4U)))), - combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U)))), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - combine(combine(framework::dataset::concat(datasets::Small3x3Shapes(), datasets::Small5x5Shapes()), framework::dataset::make("OutputTile", Size2D(4U, 4U))), framework::dataset::make("DataLayout", { DataLayout::NHWC }))), - framework::dataset::make("DataType", { DataType::F32 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(SmallWinogradFilterTransformDatasetNCHW, + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("DataType", { DataType::F32 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::DatasetMode::NIGHTLY, - combine(framework::dataset::concat(combine(framework::dataset::concat(framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", Size2D(2U, 2U))), - combine(datasets::Large3x3Shapes(), - framework::dataset::make("OutputTile", Size2D(4U, 4U)))), - combine(datasets::Large5x5Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U)))), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - combine(combine(framework::dataset::concat(datasets::Large3x3Shapes(), datasets::Large5x5Shapes()), framework::dataset::make("OutputTile", Size2D(4U, 4U))), 
framework::dataset::make("DataLayout", { DataLayout::NHWC }))), - framework::dataset::make("DataType", { DataType::F32 }))) + combine(combine(LargeWinogradFilterTransformDatasetNCHW, + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("DataType", { DataType::F32 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } +TEST_SUITE_END() // NCHW + +TEST_SUITE(NHWC) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, + combine(combine(framework::dataset::concat(SmallWinogradFilterTransformDatasetNHWC, + LargeWinogradFilterTransformDatasetNHWC), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("DataType", { DataType::F32 })), + shape_in, output_tile, data_layout, data_type) +{ + TensorShape shape_in_nhwc(shape_in); + + // Convert the shape to NHWC + permute(shape_in_nhwc, PermutationVector(2U, 0U, 1U)); + + // TensorInfo + TensorInfo tensor_info_in(shape_in_nhwc, 1, data_type); + tensor_info_in.set_data_layout(data_layout); + + WinogradInfo winograd_info(output_tile, Size2D(shape_in[0], shape_in[1]), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, data_layout /* Not needed */); + + TensorShape shape_b = compute_winograd_filter_transform_shape(tensor_info_in, winograd_info); + + // Create tensors + CLTensor a = create_tensor(shape_in_nhwc, data_type, 1, 0, QuantizationInfo(), data_layout); + CLTensor b = create_tensor(shape_b, data_type, 1, 0, QuantizationInfo(), data_layout); + + ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + // Create and configure function + CLWinogradFilterTransform winograd_filter_transform; + winograd_filter_transform.configure(&a, &b, winograd_info); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixture, framework::DatasetMode::PRECOMMIT, + 
combine(combine(SmallWinogradFilterTransformDatasetNHWC, + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("DataType", { DataType::F32 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::DatasetMode::NIGHTLY, + combine(combine(LargeWinogradFilterTransformDatasetNHWC, + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("DataType", { DataType::F32 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() // NHWC TEST_SUITE_END() // FilterTransform TEST_SUITE(OutputTransform) -// *INDENT-OFF* -// clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( framework::dataset::make("InputInfo",{ TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F16), // F16 not supported @@ -291,14 +430,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( { ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS); } -// clang-format on -// *INDENT-ON* using CLWinogradOutputTransform = CLSynthetizeFunctionWithZeroConstantBorder; using CLWinogradOutputTransformFixture = WinogradOutputTransformValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallWinogradOutputTransformDataset(), datasets::LargeWinogradOutputTransformDataset()), - framework::dataset::make("DataType", { DataType::F32 })), +TEST_SUITE(NCHW) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(SmallWinogradOutputTransformDatasetNCHW, + LargeWinogradOutputTransformDatasetNCHW), + 
framework::dataset::make("DataType", { DataType::F32 })), shape_a, winograd_info, data_type) { TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), winograd_info); @@ -315,23 +454,62 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::da winograd_output_transform.configure(&a, nullptr, &b, winograd_info); } -FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, combine(datasets::SmallWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, + combine(SmallWinogradOutputTransformDatasetNCHW, + framework::dataset::make("DataType", { DataType::F32 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, + combine(LargeWinogradOutputTransformDatasetNCHW, + framework::dataset::make("DataType", { DataType::F32 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } +TEST_SUITE_END() // NCHW +TEST_SUITE(NHWC) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(SmallWinogradOutputTransformDatasetNHWC, + LargeWinogradOutputTransformDatasetNHWC), + framework::dataset::make("DataType", { DataType::F32 })), + shape_a, winograd_info, data_type) +{ + TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), winograd_info); + + // Create tensors + CLTensor a = create_tensor(shape_a, data_type); + CLTensor b = create_tensor(shape_b, data_type, 1, 0, QuantizationInfo(), 
winograd_info.output_data_layout); + + ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + CLWinogradOutputTransform winograd_output_transform; + winograd_output_transform.configure(&a, nullptr, &b, winograd_info); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, + combine(SmallWinogradOutputTransformDatasetNHWC, + framework::dataset::make("DataType", { DataType::F32 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, + combine(LargeWinogradOutputTransformDatasetNHWC, + framework::dataset::make("DataType", { DataType::F32 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() // NHWC TEST_SUITE_END() // OutputTransform TEST_SUITE(ConvolutionLayer) -// *INDENT-OFF* -// clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16), // FP16 not supported @@ -373,16 +551,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( { ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS); } -// clang-format on -// *INDENT-ON* using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture; TEST_SUITE(Conv3x3) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, 
combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), framework::dataset::make("DataType", { DataType::F32 })), - framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), - framework::dataset::make("DataLayout", { DataLayout::NCHW }))) + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); @@ -391,20 +567,64 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, fram FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), framework::dataset::make("DataType", { DataType::F32 })), - framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), - framework::dataset::make("DataLayout", { DataLayout::NCHW }))) + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); } TEST_SUITE_END() // Conv3x3 +TEST_SUITE(Conv3x1) +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(), + 
framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} +TEST_SUITE_END() // Conv3x1 + +TEST_SUITE(Conv1x3) +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} +TEST_SUITE_END() // Conv1x3 + TEST_SUITE(Conv5x5) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(), framework::dataset::make("DataType", { DataType::F32 })), - framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), - framework::dataset::make("DataLayout", { DataLayout::NCHW }))) + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output @@ -414,8 +634,8 @@ 
FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, fram FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(), framework::dataset::make("DataType", { DataType::F32 })), - framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), - framework::dataset::make("DataLayout", { DataLayout::NCHW }))) + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output @@ -424,7 +644,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, fram TEST_SUITE_END() // Conv5x5 TEST_SUITE_END() // ConvolutionLayer - TEST_SUITE_END() // Winograd TEST_SUITE_END() // CL } // namespace validation diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp index e2415a203e..ff69b1c4b6 100644 --- a/tests/validation/Helpers.cpp +++ b/tests/validation/Helpers.cpp @@ -215,7 +215,7 @@ void transpose_matrix(const SimpleTensor &in, SimpleTensor &out) template void get_tile(const SimpleTensor &in, SimpleTensor &tile, const Coordinates &coord) { - ARM_COMPUTE_ERROR_ON(tile.shape().num_dimensions() != 2); + ARM_COMPUTE_ERROR_ON(tile.shape().num_dimensions() > 2); const int w_tile = tile.shape()[0]; const int h_tile = tile.shape()[1]; @@ -272,7 +272,36 @@ void get_tile(const SimpleTensor &in, SimpleTensor &tile, const Coordinate } } +template +void zeros(SimpleTensor &in, const Coordinates &anchor, const TensorShape &shape) +{ + ARM_COMPUTE_ERROR_ON(anchor.num_dimensions() != shape.num_dimensions()); + ARM_COMPUTE_ERROR_ON(in.shape().num_dimensions() > 2); + ARM_COMPUTE_ERROR_ON(shape.num_dimensions() > 2); + + // Check if with the dimensions greater than 2 we could have out-of-bound reads + for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d) + { + if(anchor[d] < 0 || ((anchor[d] + 
shape[d]) > in.shape()[d])) + { + ARM_COMPUTE_ERROR("anchor[d] < 0 || (anchor[d] + shape[d]) > in.shape()[d]"); + } + } + + // Get input pointer + auto in_ptr = static_cast(in(anchor[0] + anchor[1] * in.shape()[0])); + + const unsigned int n = in.shape()[0]; + + for(unsigned int y = 0; y < shape[1]; ++y) + { + std::fill(in_ptr, in_ptr + shape[0], 0); + in_ptr += n; + } +} + template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); +template void zeros(SimpleTensor &in, const Coordinates &anchor, const TensorShape &shape); } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h index 49432d693e..88262d5e66 100644 --- a/tests/validation/Helpers.h +++ b/tests/validation/Helpers.h @@ -259,6 +259,15 @@ void transpose_matrix(const SimpleTensor &in, SimpleTensor &out); */ template void get_tile(const SimpleTensor &in, SimpleTensor &tile, const Coordinates &coord); + +/** Fill with zeros the input tensor in the area defined by anchor and shape + * + * @param[in] in Input tensor to fill with zeros + * @param[out] anchor Starting point of the zeros area + * @param[in] shape Ending point of the zeros area + */ +template +void zeros(SimpleTensor &in, const Coordinates &anchor, const TensorShape &shape); } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h index aca24f13ae..ac168ebe3c 100644 --- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h +++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h @@ -259,7 +259,18 @@ protected: fill(bias, 2, 0.f, 0.f); } - WinogradInfo winograd_info(Size2D(4U, 4U), + // Set output tile + Size2D output_tile(4U, 4U); + if(weights_shape[0] == 1) + { + output_tile.width = 1; + } + else if(weights_shape[1] == 1) + { + output_tile.height = 1; + } + + 
WinogradInfo winograd_info(output_tile, Size2D(weights_shape[0], weights_shape[1]), Size2D(input_shape[0], input_shape[1]), info, diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp index 197d218129..5be4fe274b 100644 --- a/tests/validation/reference/Winograd.cpp +++ b/tests/validation/reference/Winograd.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include +#include namespace arm_compute { @@ -142,12 +143,24 @@ void initialize_matrix_transform(SimpleTensor &src, const Size2D &output_tile { { WinogradKey(std::pair(2, 2), std::pair(3, 3), WinogradTransformType::INPUT), imatrix2x2_3x3 }, { WinogradKey(std::pair(4, 4), std::pair(3, 3), WinogradTransformType::INPUT), imatrix4x4_3x3 }, + { WinogradKey(std::pair(2, 1), std::pair(3, 1), WinogradTransformType::INPUT), imatrix2x2_3x3 }, + { WinogradKey(std::pair(4, 1), std::pair(3, 1), WinogradTransformType::INPUT), imatrix4x4_3x3 }, + { WinogradKey(std::pair(1, 2), std::pair(1, 3), WinogradTransformType::INPUT), imatrix2x2_3x3 }, + { WinogradKey(std::pair(1, 4), std::pair(1, 3), WinogradTransformType::INPUT), imatrix4x4_3x3 }, { WinogradKey(std::pair(4, 4), std::pair(5, 5), WinogradTransformType::INPUT), imatrix4x4_5x5 }, { WinogradKey(std::pair(2, 2), std::pair(3, 3), WinogradTransformType::FILTER), fmatrix2x2_3x3 }, { WinogradKey(std::pair(4, 4), std::pair(3, 3), WinogradTransformType::FILTER), fmatrix4x4_3x3 }, + { WinogradKey(std::pair(2, 1), std::pair(3, 1), WinogradTransformType::FILTER), fmatrix2x2_3x3 }, + { WinogradKey(std::pair(4, 1), std::pair(3, 1), WinogradTransformType::FILTER), fmatrix4x4_3x3 }, + { WinogradKey(std::pair(1, 2), std::pair(1, 3), WinogradTransformType::FILTER), fmatrix2x2_3x3 }, + { WinogradKey(std::pair(1, 4), std::pair(1, 3), WinogradTransformType::FILTER), fmatrix4x4_3x3 }, { WinogradKey(std::pair(4, 4), std::pair(5, 5), WinogradTransformType::FILTER), fmatrix4x4_5x5 }, { WinogradKey(std::pair(2, 2), std::pair(3, 3), 
WinogradTransformType::OUTPUT), omatrix2x2_3x3 }, { WinogradKey(std::pair(4, 4), std::pair(3, 3), WinogradTransformType::OUTPUT), omatrix4x4_3x3 }, + { WinogradKey(std::pair(2, 1), std::pair(3, 1), WinogradTransformType::OUTPUT), omatrix2x2_3x3 }, + { WinogradKey(std::pair(4, 1), std::pair(3, 1), WinogradTransformType::OUTPUT), omatrix4x4_3x3 }, + { WinogradKey(std::pair(1, 2), std::pair(1, 3), WinogradTransformType::OUTPUT), omatrix2x2_3x3 }, + { WinogradKey(std::pair(1, 4), std::pair(1, 3), WinogradTransformType::OUTPUT), omatrix4x4_3x3 }, { WinogradKey(std::pair(4, 4), std::pair(5, 5), WinogradTransformType::OUTPUT), omatrix4x4_5x5 }, }; @@ -174,6 +187,20 @@ void initialize_matrix_transform(SimpleTensor &src, const Size2D &output_tile } } // namespace +template +void print_tile(SimpleTensor &in) +{ + for(int y = 0; y < in.shape()[1]; y++) + { + for(int x = 0; x < in.shape()[0]; x++) + { + std::cout << in[x + y * in.shape()[0]] << " "; + } + + std::cout << std::endl; + } +} + template SimpleTensor winograd_input_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info) { @@ -189,7 +216,10 @@ SimpleTensor winograd_input_transform(const SimpleTensor &in, const Tensor const unsigned int tile_w = output_tile_size.width + kernel_size.width - 1; const unsigned int tile_h = output_tile_size.height + kernel_size.height - 1; - TensorShape tile_dims(tile_w, tile_h); + // Get the maximum dimension from the tile size + const unsigned int tile_max_dim = std::max(tile_w, tile_h); + + TensorShape tile_dims(tile_max_dim, tile_max_dim); // Simple tensor for the input tile SimpleTensor src_tile{ tile_dims, in.data_type() }; @@ -217,11 +247,46 @@ SimpleTensor winograd_input_transform(const SimpleTensor &in, const Tensor const int in_d = in.shape().z(); const int out_d = out.shape().z(); const int num_batches = in.shape().total_size() / (in_w * in_h * in_d); - const int num_tiles_x = std::ceil((in_w - (kernel_size.width - 1) + 
conv_info.pad_left() + conv_info.pad_right()) / static_cast(output_tile_size.width)); - const int num_tiles_y = std::ceil((in_h - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast(output_tile_size.height)); const int step_x = output_tile_size.width; const int step_y = output_tile_size.height; + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(in_w, in_h), + kernel_size, + output_tile_size, + conv_info); + + const int num_tiles_x = num_tiles.width; + const int num_tiles_y = num_tiles.height; + + // In case of 1D convolution, the input tile has to be partially filled with zeros + int start_x_zero = 0; + int start_y_zero = 0; + int end_x_zero = 0; + int end_y_zero = 0; + + if(output_tile_size.width == 1) + { + start_x_zero = 1; + start_y_zero = 0; + end_x_zero = tile_max_dim - 1; + end_y_zero = tile_max_dim; + } + else if(output_tile_size.height == 1) + { + start_x_zero = 0; + start_y_zero = 1; + end_x_zero = tile_max_dim; + end_y_zero = tile_max_dim - 1; + } + + // Set the anchor and shape of the zeros area + const Coordinates anchor_zeros(start_x_zero, start_y_zero); + const TensorShape shape_zeros(end_x_zero, end_y_zero); + + // If we have a vertical filter (i.e. 1x3, 1x5,..), we need to take the elements along the y direction (step = width of the output tile) + const int step_y_transf_tile = kernel_size.width == 1 ? 
tile_max_dim : 1; + ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast(out.shape().y())); for(int b = 0; b < num_batches; ++b) @@ -238,6 +303,9 @@ SimpleTensor winograd_input_transform(const SimpleTensor &in, const Tensor // Get the tile from the input tensor get_tile(in, src_tile, Coordinates(xi, yi, z, b)); + // Fill partially with zeros in case of 1D convolution + zeros(src_tile, anchor_zeros, shape_zeros); + // Compute the transformation matrix_multiply(matrix, src_tile, tmp_tile); matrix_multiply(tmp_tile, matrix_transposed, dst_tile); @@ -247,7 +315,7 @@ SimpleTensor winograd_input_transform(const SimpleTensor &in, const Tensor { int xo = z; int yo = x + y * num_tiles_x; - out[coords2index(out.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i]; + out[coords2index(out.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i * step_y_transf_tile]; } } } @@ -268,27 +336,31 @@ SimpleTensor winograd_filter_transform(const SimpleTensor &in, const Tenso const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; - TensorShape kernel_tile_dims(kernel_size.width, kernel_size.height); - // Calculate dimensions for the tile const unsigned int input_tile_w = output_tile_size.width + kernel_size.width - 1; const unsigned int input_tile_h = output_tile_size.height + kernel_size.height - 1; const unsigned int input_tile_area = input_tile_w * input_tile_h; + // Get the maximum dimension from the filter size + const unsigned int kernel_max_dim = std::max(kernel_size.width, kernel_size.height); + + // Get the maximum dimension from the input tile + const unsigned int input_tile_max_dim = std::max(input_tile_w, input_tile_h); + // Simple tensor for the input tile - SimpleTensor input_tile{ kernel_tile_dims, in.data_type(), 1 }; + SimpleTensor input_tile{ TensorShape(kernel_max_dim, kernel_max_dim), in.data_type(), 1 }; // Simple tensor for the transformation matrix - SimpleTensor trans_matrix{ 
TensorShape(kernel_tile_dims[0], input_tile_w), in.data_type(), 1 }; + SimpleTensor trans_matrix{ TensorShape(kernel_max_dim, input_tile_max_dim), in.data_type(), 1 }; // Simple tensor for the transformation matrix transpose - SimpleTensor trans_matrix_transposed{ TensorShape(input_tile_w, kernel_tile_dims[0]), in.data_type(), 1 }; + SimpleTensor trans_matrix_transposed{ TensorShape(input_tile_max_dim, kernel_max_dim), in.data_type(), 1 }; // Simple tensor for the temporary tile - SimpleTensor tmp_tile{ TensorShape(kernel_tile_dims[0], input_tile_w), in.data_type(), 1 }; + SimpleTensor tmp_tile{ TensorShape(kernel_max_dim, input_tile_max_dim), in.data_type(), 1 }; // Simple tensor for the output tile - SimpleTensor transf_tile{ TensorShape(input_tile_w, input_tile_w), in.data_type(), 1 }; + SimpleTensor transf_tile{ TensorShape(input_tile_max_dim, input_tile_max_dim), in.data_type(), 1 }; // Initialize matrix for the filter transform initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::FILTER); @@ -300,6 +372,9 @@ SimpleTensor winograd_filter_transform(const SimpleTensor &in, const Tenso const int num_filters = in.shape()[3]; const int num_batches = in.shape().total_size() / (kernel_size.area() * num_channels * num_filters); + // If we have a vertical filter (i.e. 1x3, 1x5,..), we need to take the elements along the y direction (step_y_transf_tile = width of the output tile) + const int step_y_transf_tile = kernel_size.width == 1 ? 
input_tile_max_dim : 1; + for(int n = 0; n < num_batches; ++n) { for(int w = 0; w < num_filters; ++w) @@ -321,7 +396,7 @@ SimpleTensor winograd_filter_transform(const SimpleTensor &in, const Tenso // Store the values across the channels for(unsigned int i = 0; i < input_tile_area; ++i) { - out[output_offset + i * num_filters * num_channels] = transf_tile[i]; + out[output_offset + i * num_filters * num_channels] = transf_tile[i * step_y_transf_tile]; } } } @@ -350,15 +425,19 @@ SimpleTensor winograd_output_transform(const SimpleTensor &in, const Simpl ARM_COMPUTE_ERROR_ON(in.shape()[2] != (in_tile_w * in_tile_h)); ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::CHANNEL)]); + // Get the maximum dimension from the tile size + const unsigned int in_tile_max_dim = std::max(in_tile_w, in_tile_h); + const unsigned int out_tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); + // Compute tile dimensions // Input tile dimensions - TensorShape in_tile_dims(in_tile_w, in_tile_h); + TensorShape in_tile_dims(in_tile_max_dim, in_tile_max_dim); // Output tile dimensions - TensorShape out_tile_dims(output_tile_size.width, output_tile_size.height); + TensorShape out_tile_dims(out_tile_max_dim, out_tile_max_dim); // Transformation matrix dimensions - TensorShape tr_tile_dims(in_tile_w, output_tile_size.width); + TensorShape tr_tile_dims(in_tile_max_dim, out_tile_max_dim); // Create tensors // Simple tensor for the input tile @@ -400,15 +479,24 @@ SimpleTensor winograd_output_transform(const SimpleTensor &in, const Simpl const int stridez_out = stridey_out * h_out; const int stridew_out = stridez_out * c_out; - // Compute number of elements to process in the X and Y direction - const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right(); - const int num_elements_y = input_dimensions.height - (kernel_size.height - 
1) + conv_info.pad_top() + conv_info.pad_bottom(); - const int num_tiles_x = std::ceil(num_elements_x / static_cast(output_tile_size.width)); - const int num_tiles_y = std::ceil(num_elements_y / static_cast(output_tile_size.height)); + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input_dimensions.width, input_dimensions.height), + kernel_size, + output_tile_size, + conv_info); + + const int num_tiles_x = num_tiles.width; + const int num_tiles_y = num_tiles.height; ARM_COMPUTE_UNUSED(num_tiles_y); ARM_COMPUTE_ERROR_ON(in.shape()[1] != static_cast(num_tiles_x * num_tiles_y)); + // If we have a vertical filter (i.e. 1x3, 1x5,..), we still need to take the elements along the x direction (step_y_transf_tile = 1) + const int step_y_transf_tile = kernel_size.width == 1 ? 1 : output_tile.shape()[0]; + + // Initialize with zeros the input tile + zeros(input_tile, Coordinates(0, 0), input_tile.shape()); + for(int n = 0; n < num_batches; ++n) { for(int y = 0; y < h_in; ++y) @@ -441,7 +529,7 @@ SimpleTensor winograd_output_transform(const SimpleTensor &in, const Simpl // Check out-of-bound writes if((xo + xi < w_out) && (yo + yi < h_out)) { - out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * out_tile_w]; + out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * step_y_transf_tile]; // Add bias out[output_offset + yi * stridey_out + xi] += b[zo]; -- cgit v1.2.1