From dcb5b284300c34d5984091cfe99559cc420e59bb Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Wed, 25 Apr 2018 12:07:29 +0100 Subject: COMPMID-1048 Add NHWC data format support to Winograd filter transform 4x4_3x3 Change-Id: Ifd125fcb5451dbac3c28b15a9471048a74fee0ad Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/128987 Reviewed-by: Gian Marco Iodice Tested-by: Jenkins --- .../CL/kernels/CLWinogradFilterTransformKernel.h | 6 +- src/core/CL/CLKernelLibrary.cpp | 1 + src/core/CL/cl_kernels/winograd.cl | 177 +++++++++++++++++++-- .../CL/kernels/CLWinogradFilterTransformKernel.cpp | 15 +- src/core/Helpers.cpp | 7 + tests/validation/CL/Winograd.cpp | 20 ++- .../fixtures/WinogradConvolutionLayerFixture.h | 11 +- 7 files changed, 202 insertions(+), 35 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h index 828e2e521a..7360646019 100644 --- a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h +++ b/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h @@ -51,8 +51,9 @@ public: * @note Winograd filter transform supports the following configurations: * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5) * Strides: only unit strides + * Data Layout: NCHW for all configurations, NHWC for F(4x4, 3x3) * - * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout). Data types supported: F32. + * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F32. * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo */ @@ -62,8 +63,9 @@ public: * @note Winograd filter transform supports the following configurations: * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5) * Strides: only unit strides + * Data Layout: NCHW for all configurations, NHWC for F(4x4, 3x3) * - * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout). Data types supported: F32. + * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F32. * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo * diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index f74c6c8a4a..009d4db535 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -369,6 +369,7 @@ const std::map CLKernelLibrary::_kernel_program_map = { "winograd_filter_transform_2x2_3x3_nchw", "winograd.cl" }, { "winograd_filter_transform_4x4_3x3_nchw", "winograd.cl" }, { "winograd_filter_transform_4x4_5x5_nchw", "winograd.cl" }, + { "winograd_filter_transform_4x4_3x3_nhwc", "winograd.cl" }, { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl" }, { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl" }, { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl" }, diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl index 14bebb4b0b..6a570277ab 100644 --- a/src/core/CL/cl_kernels/winograd.cl +++ b/src/core/CL/cl_kernels/winograd.cl @@ -23,11 +23,11 @@ */ #include "helpers.h" -#if defined(NUM_CHANNELS) +#if defined(SRC_DIM_Z) /** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2 * - * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64 + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -52,7 +52,7 @@ __kernel void winograd_filter_transform_2x2_3x3_nchw( TENSOR4D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) { - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS); + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); @@ -92,8 +92,8 @@ __kernel void winograd_filter_transform_2x2_3x3_nchw( out3.s3 = (w2.s2); int z = get_global_id(2); - int x0 = z / NUM_CHANNELS; // idx filter - int y0 = z % NUM_CHANNELS; // idx channel + int x0 = z / SRC_DIM_Z; // idx filter + int y0 = z % SRC_DIM_Z; // idx channel // Get output address __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; @@ -119,7 +119,7 @@ __kernel void winograd_filter_transform_2x2_3x3_nchw( /** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 4x4 * - * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64 + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -144,7 +144,7 @@ __kernel void winograd_filter_transform_4x4_3x3_nchw( TENSOR4D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) { - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS); + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); @@ -210,8 +210,8 @@ __kernel void winograd_filter_transform_4x4_3x3_nchw( out5.s5 = (w2.s2); int z = get_global_id(2); - int x0 = z / NUM_CHANNELS; // idx filter - int y0 = z % NUM_CHANNELS; // idx channel + int x0 = z / SRC_DIM_Z; // idx filter + int y0 = z % SRC_DIM_Z; // idx channel // Get output address __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; @@ -255,9 +255,158 @@ __kernel void winograd_filter_transform_4x4_3x3_nchw( *(__global float *)(dst_addr + 35 * dst_stride_z) = out5.s5; } +/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NHWC and the output tile is 4x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x4_3x3_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); + + const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; + + // Load the values from the input tensor + float w00 = *((__global float *)(src_addr + 0 * src_stride_z + 0 * src_stride_y)); + float w01 = *((__global float *)(src_addr + 0 * src_stride_z + 1 * src_stride_y)); + float w02 = *((__global float *)(src_addr + 0 * src_stride_z + 2 * src_stride_y)); + float w10 = *((__global float *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); + float w11 = *((__global float *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); + float w12 = *((__global float *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); + float w20 = *((__global float *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); + float w21 = *((__global float *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); + float w22 = *((__global float *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); + + // Transform the 3x3 tile in a 6x6 tile + float out00, out01, out02, out03, out04, out05; + float out10, out11, out12, out13, out14, out15; + float out20, out21, out22, out23, out24, out25; + float out30, out31, out32, out33, out34, out35; + float out40, out41, out42, out43, out44, out45; + float out50, out51, out52, out53, out54, out55; + + out00 = out01 = out02 = out03 = out04 = out05 = 0.f; + out10 = out11 = out12 = out13 = out14 = out15 = 0.f; + out20 = out21 = out22 = out23 = out24 = out25 = 0.f; + out30 = out31 = out32 = out33 = out34 = out35 = 0.f; + out40 = out41 = out42 = out43 = out44 = out45 = 0.f; + out50 = out51 = out52 = out53 = out54 = out55 = 0.f; + + // Row 0 + out00 = (w00) / 16.f; + out01 = (-w00 - w01 - w02) / 24.f; + out02 = (-w00 + w01 - w02) / 24.f; + out03 = (w00 + 2.f * w01 + 4.f * w02) / 96.f; + out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f; + out05 = (w02) / 4.f; + + // Row 1 + out10 = (-w00 - w10 - w20) / 24.f; + out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f; + out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f; + out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f; + out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f; + out15 = (-w02 - w12 - w22) / 6.f; + + // Row 2 + out20 = (-w00 + w10 - w20) / 24.f; + out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f; + out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f; + out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f; + out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f; + out25 = (-w02 + w12 - w22) / 6.f; + + // Row 3 + out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f; + out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f; + out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f; + out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f; + out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f; + out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f; + + // Row 4 + out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f; + out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f; + out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f; + out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f; + out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f; + out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f; + + // Row 5 + out50 = (w20) / 4.f; + out51 = (-w20 - w21 - w22) / 6.f; + out52 = (-w20 + w21 - w22) / 6.f; + out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f; + out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f; + out55 = (w22); + + int x0 = get_global_id(2); // idx filter + int y0 = get_global_id(0); // idx channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; + + // Store the values across the channels + *(__global float *)(dst_addr + 0 * dst_stride_z) = out00; + *(__global float *)(dst_addr + 1 * dst_stride_z) = out01; + *(__global float *)(dst_addr + 2 * dst_stride_z) = out02; + *(__global float *)(dst_addr + 3 * dst_stride_z) = out03; + *(__global float *)(dst_addr + 4 * dst_stride_z) = out04; + *(__global float *)(dst_addr + 5 * dst_stride_z) = out05; + *(__global float *)(dst_addr + 6 * dst_stride_z) = out10; + *(__global float *)(dst_addr + 7 * dst_stride_z) = out11; + *(__global float *)(dst_addr + 8 * dst_stride_z) = out12; + *(__global float *)(dst_addr + 9 * dst_stride_z) = out13; + *(__global float *)(dst_addr + 10 * dst_stride_z) = out14; + *(__global float *)(dst_addr + 11 * dst_stride_z) = out15; + *(__global float *)(dst_addr + 12 * dst_stride_z) = out20; + *(__global float *)(dst_addr + 13 * dst_stride_z) = out21; + *(__global float *)(dst_addr + 14 * dst_stride_z) = out22; + *(__global float *)(dst_addr + 15 * dst_stride_z) = out23; + *(__global float *)(dst_addr + 16 * dst_stride_z) = out24; + *(__global float *)(dst_addr + 17 * dst_stride_z) = out25; + *(__global float *)(dst_addr + 18 * dst_stride_z) = out30; + *(__global float *)(dst_addr + 19 * dst_stride_z) = out31; + *(__global float *)(dst_addr + 20 * dst_stride_z) = out32; + *(__global float *)(dst_addr + 21 * dst_stride_z) = out33; + *(__global float *)(dst_addr + 22 * dst_stride_z) = out34; + *(__global float *)(dst_addr + 23 * dst_stride_z) = out35; + *(__global float *)(dst_addr + 24 * dst_stride_z) = out40; + *(__global float *)(dst_addr + 25 * dst_stride_z) = out41; + *(__global float *)(dst_addr + 26 * dst_stride_z) = out42; + *(__global float *)(dst_addr + 27 * dst_stride_z) = out43; + *(__global float *)(dst_addr + 28 * dst_stride_z) = out44; + *(__global float *)(dst_addr + 29 * dst_stride_z) = out45; + *(__global float *)(dst_addr + 30 * dst_stride_z) = out50; + *(__global float *)(dst_addr + 31 * dst_stride_z) = out51; + *(__global float *)(dst_addr + 32 * dst_stride_z) = out52; + *(__global float *)(dst_addr + 33 * dst_stride_z) = out53; + *(__global float *)(dst_addr + 34 * dst_stride_z) = out54; + *(__global float *)(dst_addr + 35 * dst_stride_z) = out55; +} /** This OpenCL kernel performs Winograd filter transform 5x5 when the data format is NCHW and the output tile is 4x4 * - * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64 + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -282,7 +431,7 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw( TENSOR4D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) { - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS); + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); @@ -452,8 +601,8 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw( out7.s7 = w41; int z = get_global_id(2); - int x0 = z / NUM_CHANNELS; // idx filter - int y0 = z % NUM_CHANNELS; // idx channel + int x0 = z / SRC_DIM_Z; // idx filter + int y0 = z % SRC_DIM_Z; // idx channel // Get output address __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; @@ -524,7 +673,7 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw( *(__global float *)(dst_addr + 62 * dst_stride_z) = out7.s6; *(__global float *)(dst_addr + 63 * dst_stride_z) = out7.s7; } -#endif // defined(NUM_CHANNELS) +#endif // defined(SRC_DIM_Z) #if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) /** This OpenCL kernel computes the input transform when the kernel size is 3x3 and the output tile is 2x2 diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp index 41b3ac50b5..cf4d73fbc1 100644 --- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp @@ -47,7 +47,6 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -56,6 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd filter transform only supports 3x3 and 5x5 kernels"); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && (output_tile_size != Size2D(4U, 4U) || kernel_size != Size2D(3U, 3U))); ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 2x2 or 4x4 output tile for 3x3 kernels"); @@ -79,10 +79,11 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const unsigned int num_elems_processed_per_iteration_x = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)); - const unsigned int num_elems_processed_per_iteration_y = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)); + const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; + const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); + const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); bool window_changed = false; AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); @@ -111,17 +112,15 @@ void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTenso ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info)); - const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); - // Set build options CLBuildOptions build_opts; - build_opts.add_option("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(idx_c))); + build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; // Create kernel - std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_nchw"; + std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(input->info()->data_layout())); _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); _input = input; diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp index e336331663..c0af3bb379 100644 --- a/src/core/Helpers.cpp +++ b/src/core/Helpers.cpp @@ -59,6 +59,13 @@ Window arm_compute::calculate_max_window(const ValidRegion &valid_region, const ++n; } + if(anchor.num_dimensions() > 2) + { + window.set(2, Window::Dimension(anchor[2], std::max(1, shape[2]), steps[2])); + + ++n; + } + for(; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max(1, shape[n]))); diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp index a61dd3f8f4..17dd58ffdc 100644 --- a/tests/validation/CL/Winograd.cpp +++ b/tests/validation/CL/Winograd.cpp @@ -203,10 +203,12 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combi } FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixture, framework::DatasetMode::ALL, - combine(combine(framework::dataset::concat(framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", Size2D(2U, 2U))), combine(datasets::Small3x3Shapes(), - framework::dataset::make("OutputTile", Size2D(4U, 4U)))), - combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U)))), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + combine(framework::dataset::concat(combine(framework::dataset::concat(framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", Size2D(2U, 2U))), + combine(datasets::Small3x3Shapes(), + framework::dataset::make("OutputTile", Size2D(4U, 4U)))), + combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U)))), + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + combine(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U))), framework::dataset::make("DataLayout", { DataLayout::NHWC }))), framework::dataset::make("DataType", { DataType::F32 }))) { // Validate output @@ -214,10 +216,12 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixture, framework::Da } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::DatasetMode::NIGHTLY, - combine(combine(framework::dataset::concat(framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", Size2D(2U, 2U))), combine(datasets::Large3x3Shapes(), - framework::dataset::make("OutputTile", Size2D(4U, 4U)))), - combine(datasets::Large5x5Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U)))), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + combine(framework::dataset::concat(combine(framework::dataset::concat(framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", Size2D(2U, 2U))), + combine(datasets::Large3x3Shapes(), + framework::dataset::make("OutputTile", Size2D(4U, 4U)))), + combine(datasets::Large5x5Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U)))), + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + combine(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U))), framework::dataset::make("DataLayout", { DataLayout::NHWC }))), framework::dataset::make("DataType", { DataType::F32 }))) { // Validate output diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h index e23368add6..6381b99131 100644 --- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h +++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h @@ -391,11 +391,16 @@ protected: } } - TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataLayout data_layout, DataType data_type) + TensorType compute_target(TensorShape input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataLayout data_layout, DataType data_type) { + if(data_layout == DataLayout::NHWC) + { + permute(input_shape, PermutationVector(2U, 0U, 1U)); + } + // Create tensors TensorType src = create_tensor(input_shape, data_type, 1, 0, QuantizationInfo(), data_layout); - TensorType dst = create_tensor(output_shape, data_type, 1, 0, QuantizationInfo(), data_layout); + TensorType dst = create_tensor(output_shape, data_type, 1, 0, QuantizationInfo()); // Create and configure function FunctionType filter_transform; @@ -422,7 +427,7 @@ protected: SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataLayout data_layout, DataType data_type) { // Create reference - SimpleTensor src{ input_shape, data_type, 1, 0, QuantizationInfo(), data_layout }; + SimpleTensor src{ input_shape, data_type, 1, 0, QuantizationInfo() }; // Fill reference fill(src, 0, -1.f, 1.f); -- cgit v1.2.1