From 0f170396e84836ad8c54d54421e95c61812968be Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Wed, 18 Jul 2018 16:13:12 +0100 Subject: COMPMID-1342 Add grouping support to CLIm2ColKernel Change-Id: I4afb19751520a90fee27fb49b775cd10e92a94f5 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140476 Reviewed-by: Gian Marco Iodice Tested-by: Jenkins --- arm_compute/core/CL/kernels/CLIm2ColKernel.h | 9 +- arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 12 +- arm_compute/core/utils/misc/ShapeCalculator.h | 14 +- arm_compute/runtime/NEON/functions/NEIm2Col.h | 14 +- src/core/CL/cl_kernels/im2col.cl | 144 +++++++++++++++++++-- src/core/CL/kernels/CLIm2ColKernel.cpp | 56 +++++--- src/core/NEON/kernels/NEIm2ColKernel.cpp | 12 +- src/runtime/CL/functions/CLGEMM.cpp | 6 +- .../CL/functions/CLGEMMConvolutionLayer.cpp | 2 +- src/runtime/NEON/functions/NEFlattenLayer.cpp | 2 +- .../NEON/functions/NEFullyConnectedLayer.cpp | 4 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 4 +- src/runtime/NEON/functions/NEIm2Col.cpp | 10 +- .../NEON/functions/NELocallyConnectedLayer.cpp | 2 +- tests/datasets/ShapeDatasets.h | 33 +++++ tests/validation/CL/Im2Col.cpp | 80 ++++++++++-- tests/validation/NEON/Im2Col.cpp | 9 +- tests/validation/fixtures/Im2ColFixture.h | 10 +- tests/validation/reference/Im2Col.cpp | 45 ++++--- tests/validation/reference/Im2Col.h | 3 +- 20 files changed, 368 insertions(+), 103 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h index ae19319047..c678f277cb 100644 --- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h +++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h @@ -76,8 +76,10 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution */ - void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U)); + void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], @@ -88,10 +90,12 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -103,6 +107,7 @@ public: unsigned int _num_elems_processed_per_iteration; Size2D _kernel_dims; PadStrideInfo _conv_info; + unsigned int _num_groups; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLIM2COLKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h index 19da7cfd53..37145a38c1 100644 --- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h @@ -83,12 +83,13 @@ public: * @param[in] kernel_dims The kernel dimensions (width and height). * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution * @param[in] is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments * @param[in] is_flatten (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). */ void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, bool is_fully_connected = false, bool is_flatten = false, const Size2D &dilation = Size2D(1U, 1U)); + bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, bool is_fully_connected = false, bool is_flatten = false); /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], @@ -98,14 +99,15 @@ public: * @param[in] kernel_dims The kernel dimensions (width and height). * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. - * @param[in] is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments - * @param[in] is_flatten (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution + * @param[in] is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments + * @param[in] is_flatten (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, bool is_fully_connected = false, bool is_flatten = false, const Size2D &dilation = Size2D(1U, 1U)); + bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, bool is_fully_connected = false, bool is_flatten = false); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index f726ce9ad3..fc6abf95f3 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -192,9 +192,15 @@ inline TensorShape compute_deconvolution_shape(const ITensorInfo &input, unsigne return scale_out_shape; } -inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z) +inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z, + unsigned int num_groups = 1) { - // The output shape will be the 2D shape used as input for GEMM [ out_channels * kernel_area, num_elems_per_out_channel ] + // The output shape will be the 3D shape [ out_channels * kernel_area, num_elems_per_out_channel, batches ] if batch_size_on_z == true + // or the 4D shape [ out_channels * kernel_area / num_groups, num_elems_per_out_channel, num_groups, batches ] if batch_size_on_z == false + + ARM_COMPUTE_ERROR_ON(num_groups == 0); + ARM_COMPUTE_ERROR_ON(num_groups > 1 && input->data_layout() != DataLayout::NCHW); + ARM_COMPUTE_ERROR_ON(num_groups > 1 && batch_size_on_z); TensorShape output_shape{ input->tensor_shape() }; @@ -204,7 +210,7 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation); - output_shape.set(0, (output_shape[channel_idx] * kernel_dims.area() + (has_bias ? 1 : 0))); + output_shape.set(0, (output_shape[channel_idx] / num_groups * kernel_dims.area() + (has_bias ? 1 : 0))); // NOLINT output_shape.set(1, (out_dims.first * out_dims.second)); if(batch_size_on_z && output_shape.num_dimensions() >= 3) { @@ -212,7 +218,7 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz } else { - output_shape.set(2, 1); + output_shape.set(2, num_groups); } return output_shape; diff --git a/arm_compute/runtime/NEON/functions/NEIm2Col.h b/arm_compute/runtime/NEON/functions/NEIm2Col.h index d888b7e8f5..9df4f070d8 100644 --- a/arm_compute/runtime/NEON/functions/NEIm2Col.h +++ b/arm_compute/runtime/NEON/functions/NEIm2Col.h @@ -49,10 +49,13 @@ public: * @param[in] kernel_dims The kernel dimensions (width and height). 
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution * @param[in] is_fully_connected (Optional) Determines whether this function will be called by @ref NEFullyConnectedLayer in order to validate the arguments * @param[in] is_flatten (Optional) Determines whether this function will be called by @ref NEFlattenLayer in order to validate the arguments */ - void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected = false, bool is_flatten = false); + void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, bool is_fully_connected = false, bool is_flatten = false); /** Static function to check if given info will lead to a valid configuration of @ref NEIm2Col * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], @@ -62,12 +65,15 @@ public: * @param[in] kernel_dims The kernel dimensions (width and height). * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. - * @param[in] is_fully_connected Determines whether this function will be called by @ref NEFullyConnectedLayer in order to validate the arguments - * @param[in] is_flatten Determines whether this function will be called by @ref NEFlattenLayer in order to validate the arguments + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution + * @param[in] is_fully_connected (Optional) Determines whether this function will be called by @ref NEFullyConnectedLayer in order to validate the arguments + * @param[in] is_flatten (Optional) Determines whether this function will be called by @ref NEFlattenLayer in order to validate the arguments * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, bool is_fully_connected = false, bool is_flatten = false); // Inherited methods overridden: void run() override; diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl index 5db1d6ce33..186d5a80ad 100644 --- a/src/core/CL/cl_kernels/im2col.cl +++ b/src/core/CL/cl_kernels/im2col.cl @@ -43,6 +43,7 @@ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. 
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -57,13 +58,19 @@ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ __kernel void im2col1x1_stridex1_nchw( TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) uint src_stride_w, uint dst_stride_w) { @@ -86,13 +93,22 @@ __kernel void im2col1x1_stridex1_nchw( const uint yi = yc * STRIDE_Y; // Calculate output indices - const uint xo = ch; + +#if defined(NUM_GROUPS) + const uint xo = ch % (SRC_DEPTH / NUM_GROUPS); + const uint zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const uint xo = ch; +#endif // defined(NUM_GROUPS) const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution // Get input and output address __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w; - +#if defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w; +#else // defined(NUM_GROUPS) __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w; +#endif // defined(NUM_GROUPS) VEC_DATA_TYPE(DATA_TYPE, 4) data = vload4(0, (__global DATA_TYPE *)input_ptr); @@ -106,7 +122,11 @@ __kernel void im2col1x1_stridex1_nchw( *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3; #ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if(xo == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) { *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f; *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f; @@ -130,6 +150,7 @@ __kernel void im2col1x1_stridex1_nchw( * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 * * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8/F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -144,13 +165,19 @@ __kernel void im2col1x1_stridex1_nchw( * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ __kernel void im2col_generic_nchw( TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) uint src_stride_w, uint dst_stride_w) { @@ -164,11 +191,20 @@ __kernel void im2col_generic_nchw( const int yi = yc * STRIDE_Y - PAD_TOP; // Calculate output indices - const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; +#endif // defined(NUM_GROUPS) const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; +#if defined(NUM_GROUPS) + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; +#else // defined(NUM_GROUPS) __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; +#endif // defined(NUM_GROUPS) // Linearize convolution elements for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) @@ -193,7 +229,11 @@ __kernel void im2col_generic_nchw( } #ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) { *output_ptr = 1.0f; } @@ -225,13 +265,19 @@ __kernel void im2col_generic_nchw( * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
*/ __kernel void im2col3x3_nchw( TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) uint src_stride_w, uint dst_stride_w) { @@ -245,13 +291,21 @@ __kernel void im2col3x3_nchw( const int yi = yc * STRIDE_Y - PAD_TOP; // Calculate output indices - const int xo = ch * 9; // 3x3 +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3 + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * 9; // 3x3 +#endif // defined(NUM_GROUPS) const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution // Get input and output address __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w; - +#if defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; +#else // defined(NUM_GROUPS) __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; +#endif // defined(NUM_GROUPS) VEC_DATA_TYPE(DATA_TYPE, 3) row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); @@ -281,7 +335,11 @@ __kernel void im2col3x3_nchw( *((__global DATA_TYPE *)output_ptr + 8) = row2.s2; #ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) { *((__global DATA_TYPE *)output_ptr + 9) = 1.0f; } @@ -298,6 +356,7 @@ __kernel void im2col3x3_nchw( * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -312,13 +371,19 @@ __kernel void im2col3x3_nchw( * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
*/ __kernel void im2col5x5_nchw( TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) uint src_stride_w, uint dst_stride_w) { @@ -332,7 +397,12 @@ __kernel void im2col5x5_nchw( const int yi = yc * STRIDE_Y - PAD_TOP; // Calculate output indices - const int xo = ch * 25; // 5x5 +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5 + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * 25; // 5x5 +#endif // defined(NUM_GROUPS) const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 @@ -353,8 +423,11 @@ __kernel void im2col5x5_nchw( // Get input and output address __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w; - +#if defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; +#else // defined(NUM_GROUPS) __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; +#endif // defined(NUM_GROUPS) { VEC_DATA_TYPE(DATA_TYPE, 4) @@ -455,7 +528,11 @@ __kernel void im2col5x5_nchw( } #ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) { *((__global DATA_TYPE *)output_ptr) = 1.0f; } @@ -471,6 +548,7 @@ __kernel void im2col5x5_nchw( * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -485,13 +563,19 @@ __kernel void im2col5x5_nchw( * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
*/ __kernel void im2col11x11_padx0_pady0_nchw( TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) uint src_stride_w, uint dst_stride_w) { @@ -505,13 +589,22 @@ __kernel void im2col11x11_padx0_pady0_nchw( const int yi = yc * STRIDE_Y; // Calculate output indices - const int xo = ch * 121; // 11x11 +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11 + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * 121; // 11x11 +#endif // defined(NUM_GROUPS) const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution // Get input and output address __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w; - +#if defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; +#else // defined(NUM_GROUPS) __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; +#endif // defined(NUM_GROUPS) + { VEC_DATA_TYPE(DATA_TYPE, 8) row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); @@ -655,7 +748,11 @@ __kernel void im2col11x11_padx0_pady0_nchw( } #ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) { *((__global DATA_TYPE *)output_ptr) = 1.0f; } @@ -671,6 +768,7 @@ __kernel void im2col11x11_padx0_pady0_nchw( * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3. * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -685,13 +783,19 @@ __kernel void im2col11x11_padx0_pady0_nchw( * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
*/ __kernel void im2col_generic_padx0_pady0_nchw( TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) uint src_stride_w, uint dst_stride_w) { @@ -703,11 +807,23 @@ __kernel void im2col_generic_padx0_pady0_nchw( // Calculate input indices const int xi = xc * STRIDE_X; const int yi = yc * STRIDE_Y; + // Calculate output indices +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; +#endif // defined(NUM_GROUPS) + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; +#if defined(NUM_GROUPS) + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; +#else // defined(NUM_GROUPS) __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; +#endif // defined(NUM_GROUPS) + // Linearize convolution elements for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y) { @@ -734,7 +850,11 @@ __kernel void im2col_generic_padx0_pady0_nchw( } /* End of loop over KERNEL_HEIGHT */ #ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) { *output_ptr = 1.0f; } diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index 42bb96c16f..3d71567eee 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -54,18 +54,24 @@ struct Im2ColConfiguration bool is_padding_required_nchw{}; }; -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, + unsigned int num_groups) { + const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1); + ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % num_groups) != 0); if(output->total_size() > 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, 
conv_info, has_bias, dilation, true)); + const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -74,12 +80,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw) + unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true); + TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape)); @@ -141,7 +147,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen return std::make_pair(err, win); } -Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation) +Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) { const DataLayout data_layout = input->data_layout(); const DataType data_type = input->data_type(); @@ -177,6 +183,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel)); build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); + build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->quantization_info().offset), "-DPAD_VALUE=0"); build_opts.add_option_if(has_bias, "-DHAS_BIAS"); @@ -274,14 +281,15 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size } // namespace CLIm2ColKernel::CLIm2ColKernel() - : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info() + : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() { } -void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation) +void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation)); + 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups)); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -292,7 +300,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const // Select and configure the optimal OpenCL kernel to run. // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration // and the padding requirement flag - Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation); + Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation, num_groups); // Create kernel _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(im2col_config.kernel_name, im2col_config.build_options)); @@ -303,10 +311,11 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; _kernel_dims = kernel_dims; // Only needed by the Tuner _conv_info = conv_info; // Only needed by the Tuner + _num_groups = num_groups; // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, - im2col_config.is_padding_required_nchw); + im2col_config.is_padding_required_nchw, num_groups); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -322,12 +331,13 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); } -Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation) +Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, + unsigned int num_groups) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation)); - Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups)); + Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation, num_groups); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, - im2col_config.is_padding_required_nchw) + im2col_config.is_padding_required_nchw, num_groups) .first); return Status{}; } @@ -337,8 +347,6 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const DataLayout data_layout = _input->info()->data_layout(); - // Get initial windows // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension Window window_collapsed = 
window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -353,7 +361,7 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue) Window slice_in = first_slice_3d; Window slice_out = window_output.first_slice_window_2D(); - if(data_layout == DataLayout::NHWC) + if(_input->info()->data_layout() == DataLayout::NHWC) { const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); const int num_batches = tmp_win[3].end(); @@ -379,13 +387,21 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue) slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + _kernel.setArg(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3])); + _kernel.setArg(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); do { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); - _kernel.setArg(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2])); + if(_num_groups == 1) + { + add_2D_tensor_argument(idx, _output, slice_out); + } + else + { + add_3D_tensor_argument(idx, _output, slice_out); + } enqueue(queue, *this, slice, lws_hint()); } while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 8cb4f4b889..98b1488a9d 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -45,12 +45,13 @@ using namespace arm_compute; namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation) + bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one is not supported on NEON"); if(output->total_size() > 0) { @@ -290,13 +291,14 @@ NEIm2ColKernel::NEIm2ColKernel() } void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation) + bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Perform validation step ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation)); + ARM_COMPUTE_UNUSED(num_groups); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, 
is_flatten)); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -378,9 +380,9 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size } Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation) + bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten)); return Status{}; } diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index a8d7058f2a..1ad8531920 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -181,7 +181,6 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; - TensorInfo tmp_output_info{}; // Get the GPU target const GPUTarget gpu_target = CLScheduler::get().target(); @@ -229,13 +228,12 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso } // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info, gpu_target)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target)); if(beta != 0 && c != nullptr) { // Validate matrix addition kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, &tmp_output_info, beta)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); } return Status{}; diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index ca6157ef13..26fd906dd1 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -391,7 +391,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI const Size2D kernel_dims(kernel_width, kernel_height); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true); + TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true /* num_groups == 1, num_groups */); auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape)); diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp index 32edf93b63..1814d61e2f 100644 --- a/src/runtime/NEON/functions/NEFlattenLayer.cpp +++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp @@ -32,6 +32,6 @@ using namespace arm_compute; void NEFlattenLayer::configure(const ITensor *input, ITensor *output) { 
auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>(); - k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, false, true); + k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, false, true); _kernel = std::move(k); } \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index c2f0283d4e..f1606aa93e 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -118,7 +118,7 @@ void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITenso // Configure im2col kernel _memory_group.manage(&_im2col_output); - _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true); + _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true); // Configure matrix multiply kernel configure_mm(&_im2col_output, weights, output); @@ -308,7 +308,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate im2col kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true)); input_to_use = &im2col_input; } else diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index df4a040bad..33284470f4 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -233,7 +233,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig _memory_group.manage(&_im2col_output); // Configure - _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation); + _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation); // Update GEMM input gemm_input_to_use = &_im2col_output; @@ -401,7 +401,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); im2col_reshaped_info.set_quantization_info(input->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, false, false, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation)); gemm_input_to_use = &im2col_reshaped_info; } else if(append_bias) diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp index 6b95cb0256..4245b650e2 100644 --- a/src/runtime/NEON/functions/NEIm2Col.cpp +++ b/src/runtime/NEON/functions/NEIm2Col.cpp @@ -34,16 +34,18 @@ NEIm2Col::NEIm2Col() { } -void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten) +void NEIm2Col::configure(const ITensor *input, ITensor *output, const 
Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups, + bool is_fully_connected, bool is_flatten) { _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten); + _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten); } -Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten) +Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, + unsigned int num_groups, bool is_fully_connected, bool is_flatten) { - return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten); + return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten); } void NEIm2Col::run() diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp index 0737bd2f73..80a2541176 100644 --- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp @@ -113,7 +113,7 @@ Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensor TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type()); TensorInfo gemm_output_info(shape_gemm, 1, input->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false)); + ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias)); ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info)); ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info)); ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h))); diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index 4a6d778f9f..a1ca983186 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -635,6 +635,39 @@ public: } }; +/** Data set containing small grouped im2col tensor shapes. */ +class GroupedIm2ColSmallShapes final : public ShapeDataset +{ +public: + GroupedIm2ColSmallShapes() + : ShapeDataset("Shape", + { + TensorShape{ 11U, 11U, 48U }, + TensorShape{ 27U, 13U, 24U }, + TensorShape{ 128U, 64U, 12U, 3U }, + TensorShape{ 11U, 11U, 48U, 4U }, + TensorShape{ 27U, 13U, 24U, 4U }, + TensorShape{ 11U, 11U, 48U, 5U } + }) + { + } +}; + +/** Data set containing large grouped im2col tensor shapes. */ +class GroupedIm2ColLargeShapes final : public ShapeDataset +{ +public: + GroupedIm2ColLargeShapes() + : ShapeDataset("Shape", + { + TensorShape{ 1921U, 1083U, 12U }, + TensorShape{ 641U, 485U, 24U, 3U }, + TensorShape{ 799U, 595U, 12U, 4U }, + }) + { + } +}; + /** Data set containing small grouped weights tensor shapes. 
*/ class GroupedWeightsSmallShapes final : public ShapeDataset { diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp index e8019cca92..cf7c79ad72 100644 --- a/tests/validation/CL/Im2Col.cpp +++ b/tests/validation/CL/Im2Col.cpp @@ -53,9 +53,14 @@ const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3 const auto padstrides = framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) }); -const auto conv_args = combine(combine(combine(conv_filter_sizes, padstrides), - framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })); +const auto conv_args = combine(combine(combine(combine(conv_filter_sizes, padstrides), + framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + framework::dataset::make("NumGroups", { 1 })); +const auto grouped_args = combine(combine(combine(combine(conv_filter_sizes, padstrides), + framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))), + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("NumGroups", { 2, 3, 4 })); } // namespace TEST_SUITE(CL) TEST_SUITE(Im2Col) @@ -96,7 +101,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode: // Validate output validate(CLAccessor(_target), _reference); } -TEST_SUITE_END() FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), conv_args), @@ -105,8 +109,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode: // Validate output validate(CLAccessor(_target), _reference); } - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +TEST_SUITE_END() TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), @@ -124,9 +127,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode:: validate(CLAccessor(_target), _reference); } TEST_SUITE_END() - -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - TEST_SUITE_END() TEST_SUITE(QASYMM8) @@ -146,6 +146,68 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMod } TEST_SUITE_END() +TEST_SUITE(Grouped) +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType", + DataType::F32)), + grouped_args), + framework::dataset::make("ChannelsFirstOutputNHWC", true))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType", + DataType::F32)), + grouped_args), + framework::dataset::make("ChannelsFirstOutputNHWC", true))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() + +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType", + DataType::F16)), + grouped_args), + 
diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp
index e8019cca92..cf7c79ad72 100644
--- a/tests/validation/CL/Im2Col.cpp
+++ b/tests/validation/CL/Im2Col.cpp
@@ -53,9 +53,14 @@ const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3
 const auto padstrides = framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U),
                                                                 PadStrideInfo(1U, 1U, 1U, 1U),
                                                                 PadStrideInfo(2U, 2U, 0U, 2U) });
-const auto conv_args = combine(combine(combine(conv_filter_sizes, padstrides),
-                               framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+const auto conv_args = combine(combine(combine(combine(conv_filter_sizes, padstrides),
+                               framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               framework::dataset::make("NumGroups", { 1 }));
+const auto grouped_args = combine(combine(combine(combine(conv_filter_sizes, padstrides),
+                                  framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+                                  framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                  framework::dataset::make("NumGroups", { 2, 3, 4 }));
 } // namespace
 TEST_SUITE(CL)
 TEST_SUITE(Im2Col)
@@ -96,7 +101,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode:
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-TEST_SUITE_END()
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
                                                                                                    conv_args),
@@ -105,8 +109,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode:
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE_END()
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
@@ -124,9 +127,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode::
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END()
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8)
@@ -146,6 +146,68 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMod
 }
 TEST_SUITE_END()
 
+TEST_SUITE(Grouped)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+                                                                                               DataType::F32)),
+                                                                                               grouped_args),
+                                                                                               framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+                                                                                                   DataType::F32)),
+                                                                                                   grouped_args),
+                                                                                                   framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+                                                                                              DataType::F16)),
+                                                                                              grouped_args),
+                                                                                              framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+                                                                                                  DataType::F16)),
+                                                                                                  grouped_args),
+                                                                                                  framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+                                                                                                 DataType::QASYMM8)),
+                                                                                                 grouped_args),
+                                                                                                 framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+                                                                                                     DataType::QASYMM8)),
+                                                                                                     grouped_args),
+                                                                                                     framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index f011ebe935..0ea68bf49d 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -40,9 +40,10 @@ namespace validation
 namespace
 {
 const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U), Size2D(3U, 1U), Size2D(1U, 5U), Size2D(5U, 5U), Size2D(7U, 7U) });
-const auto conv_args = combine(combine(combine(conv_filter_sizes, framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
-                               framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+const auto conv_args = combine(combine(combine(combine(conv_filter_sizes, framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
+                               framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               framework::dataset::make("NumGroups", { 1 }));
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(Im2Col)
@@ -66,7 +67,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, false, true })),
                input_info, output_info, has_bias, expected)
 {
-    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias, false, false));
+    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
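[Illustrative aside, not part of the patch.] The Validate test above shows the simplification the new defaults buy: a call that previously had to pass "false, false" for the trailing flags now needs no extra arguments. A sketch of the equivalent call, with a hypothetical wrapper name:

    // Sketch only: mirrors the simplified NEIm2Col::validate() call above.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEIm2Col.h"

    using namespace arm_compute;

    // Hypothetical helper; dilation, num_groups and both flags take their defaults.
    bool is_valid_im2col(const TensorInfo &input, const TensorInfo &output, bool has_bias)
    {
        return bool(NEIm2Col::validate(&input, &output, Size2D(3U, 3U), PadStrideInfo(), has_bias));
    }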
diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h
index da2576b37c..b5e83a9872 100644
--- a/tests/validation/fixtures/Im2ColFixture.h
+++ b/tests/validation/fixtures/Im2ColFixture.h
@@ -50,13 +50,14 @@ class Im2ColValidationFixture : public framework::Fixture
 public:
     template <typename...>
     void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout,
-               bool channels_first_output_nhwc)
+               unsigned int num_groups, bool channels_first_output_nhwc)
     {
         _kernel_dims = kernel_dims;
         _conv_info   = conv_info;
         _quant_info  = quant_info;
         _data_layout = data_layout;
         _has_bias    = data_type != DataType::QASYMM8;
+        _num_groups  = num_groups;
 
         if(_data_layout == DataLayout::NHWC)
         {
@@ -66,7 +67,7 @@ public:
         TensorInfo input_info(input_shape, 1, data_type);
         input_info.set_data_layout(_data_layout);
 
-        const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z);
+        const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z && _num_groups == 1, _num_groups);
         _target                        = compute_target(input_shape, output_shape, data_type);
 
         compute_reference(input_shape, output_shape, data_type, channels_first_output_nhwc);
@@ -87,7 +88,7 @@ protected:
         // Create and configure function
         FunctionType im2col_func;
-        im2col_func.configure(&src, &dst, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), _num_groups);
+        im2col_func.configure(&src, &dst, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), _num_groups);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -117,7 +118,7 @@ protected:
         // Fill reference
         fill(src);
 
-        reference::im2col(src, _reference, _kernel_dims, _conv_info, _has_bias, channels_first_output_nhwc);
+        reference::im2col(src, _reference, _kernel_dims, _conv_info, _has_bias, _num_groups, channels_first_output_nhwc);
     }
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
@@ -126,6 +127,7 @@ protected:
     DataLayout       _data_layout{};
     QuantizationInfo _quant_info{};
     bool             _has_bias{};
+    unsigned int     _num_groups{};
 };
 } // namespace validation
 } // namespace test
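[Illustrative aside, not part of the patch.] The fixture above forwards _num_groups to compute_im2col_conv_shape. A simplified re-derivation of the grouped NCHW output shape, using a hypothetical helper rather than the library's ShapeCalculator: each output row holds one patch restricted to a group's channel slice, and groups are stacked on a third dimension.

    // Sketch only: grouped im2col output-shape arithmetic.
    #include <cstddef>
    #include <iostream>

    struct Im2ColShape
    {
        std::size_t width;  // elements per output row (one row per patch)
        std::size_t height; // number of patches (convolved width x height)
        std::size_t depth;  // number of groups
    };

    // Hypothetical helper, not the library's compute_im2col_conv_shape().
    Im2ColShape grouped_im2col_shape(std::size_t k_w, std::size_t k_h, std::size_t channels,
                                     std::size_t conv_w, std::size_t conv_h,
                                     bool has_bias, std::size_t num_groups)
    {
        Im2ColShape out{};
        out.width  = k_w * k_h * (channels / num_groups) + (has_bias ? 1U : 0U);
        out.height = conv_w * conv_h;
        out.depth  = num_groups;
        return out;
    }

    int main()
    {
        // 3x3 kernel over a 48-channel input with 9x9 output positions, 4 groups, no bias.
        const Im2ColShape s = grouped_im2col_shape(3U, 3U, 48U, 9U, 9U, false, 4U);
        std::cout << s.width << " x " << s.height << " x " << s.depth << "\n"; // 108 x 81 x 4
        return 0;
    }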
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
index 2459499474..0c41d88f3e 100644
--- a/tests/validation/reference/Im2Col.cpp
+++ b/tests/validation/reference/Im2Col.cpp
@@ -36,7 +36,7 @@ namespace validation
 namespace reference
 {
 template <typename T>
-void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
     const int stride_x = conv_info.stride().first;
@@ -58,26 +58,32 @@ void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D
 
     for(int b = 0; b < batches; ++b)
     {
-        for(int yo = 0; yo < dst_height; ++yo)
+        for(int g = 0; g < static_cast<int>(num_groups); ++g)
         {
-            // Compute input spatial coordinates
-            const int xi = (yo % convolved_dims.first) * stride_x;
-            const int yi = (yo / convolved_dims.first) * stride_y;
+            const int first_group_ch = g * (src_channels / num_groups);
+            const int last_group_ch  = (g + 1) * (src_channels / num_groups);
 
-            for(int ci = 0; ci < src_channels; ++ci)
+            for(int yo = 0; yo < dst_height; ++yo)
             {
-                for(int yk = 0; yk < kernel_height; ++yk)
+                // Compute input spatial coordinates
+                const int xi = (yo % convolved_dims.first) * stride_x;
+                const int yi = (yo / convolved_dims.first) * stride_y;
+
+                for(int ci = first_group_ch; ci < last_group_ch; ++ci)
                 {
-                    for(int xk = 0; xk < kernel_width; ++xk)
+                    for(int yk = 0; yk < kernel_height; ++yk)
                     {
-                        dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+                        for(int xk = 0; xk < kernel_width; ++xk)
+                        {
+                            dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+                        }
                     }
                 }
-            }
-            if(has_bias)
-            {
-                dst[dst_idx++] = static_cast<T>(1);
+                if(has_bias)
+                {
+                    dst[dst_idx++] = static_cast<T>(1);
+                }
             }
         }
     }
@@ -179,13 +185,13 @@ void im2col_nhwc_channel_first(const SimpleTensor<T> &src, SimpleTensor<T> &dst,
 }
 
 template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc)
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups, bool channels_first_output_nhwc)
 {
     switch(src.data_layout())
     {
         case DataLayout::NCHW:
         {
-            im2col_nchw(src, dst, kernel_dims, conv_info, has_bias);
+            im2col_nchw(src, dst, kernel_dims, conv_info, has_bias, num_groups);
             break;
         }
         case DataLayout::NHWC:
@@ -208,9 +214,12 @@ void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kern
     }
 }
 
-template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
-template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
-template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
index b1ebaf25da..84ee237453 100644
--- a/tests/validation/reference/Im2Col.h
+++ b/tests/validation/reference/Im2Col.h
@@ -35,7 +35,8 @@ namespace validation
 namespace reference
 {
 template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc = false);
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups,
+            bool channels_first_output_nhwc = false);
 } // namespace reference
 } // namespace validation
 } // namespace test
-- 
cgit v1.2.1
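[Illustrative appendix, not part of the patch.] The grouped NCHW reference loop above, transcribed to a self-contained function over a plain float buffer (SimpleTensor and tensor_elem_at replaced by a vector and a bounds-checked read; zero padding assumed):

    // Sketch only: grouped im2col over an NCHW float buffer.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // src is laid out NCHW: [batches][channels][h][w].
    std::vector<float> im2col_nchw_grouped(const std::vector<float> &src,
                                           int w, int h, int channels, int batches,
                                           int k_w, int k_h, int stride_x, int stride_y,
                                           int pad_x, int pad_y, bool has_bias, int num_groups)
    {
        const int conv_w   = (w + 2 * pad_x - k_w) / stride_x + 1;
        const int conv_h   = (h + 2 * pad_y - k_h) / stride_y + 1;
        const int group_ch = channels / num_groups;

        std::vector<float> dst;

        // Constant-border read, mirroring tensor_elem_at(..., BorderMode::CONSTANT, 0).
        auto at = [&](int x, int y, int c, int b) -> float
        {
            if(x < 0 || x >= w || y < 0 || y >= h)
            {
                return 0.f;
            }
            return src[((static_cast<std::size_t>(b) * channels + c) * h + y) * w + x];
        };

        for(int b = 0; b < batches; ++b)
        {
            for(int g = 0; g < num_groups; ++g)
            {
                for(int yo = 0; yo < conv_w * conv_h; ++yo)
                {
                    // Top-left corner of the current patch.
                    const int xi = (yo % conv_w) * stride_x;
                    const int yi = (yo / conv_w) * stride_y;

                    // Only this group's channel slice contributes to the row.
                    for(int c = g * group_ch; c < (g + 1) * group_ch; ++c)
                    {
                        for(int yk = 0; yk < k_h; ++yk)
                        {
                            for(int xk = 0; xk < k_w; ++xk)
                            {
                                dst.push_back(at(xi + xk - pad_x, yi + yk - pad_y, c, b));
                            }
                        }
                    }
                    if(has_bias)
                    {
                        dst.push_back(1.f); // bias column appended per patch row
                    }
                }
            }
        }
        return dst;
    }

    int main()
    {
        // 4x4 input, 2 channels split into 2 groups, 2x2 kernel, stride 1, no padding.
        const std::vector<float> src(4 * 4 * 2, 1.f);
        const std::vector<float> cols = im2col_nchw_grouped(src, 4, 4, 2, 1, 2, 2, 1, 1, 0, 0, false, 2);
        std::cout << cols.size() << "\n"; // 2 groups x 9 patches x 4 elements = 72
        return 0;
    }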