author     Giorgio Arena <giorgio.arena@arm.com>      2018-07-18 16:13:12 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:54:54 +0000
commit     0f170396e84836ad8c54d54421e95c61812968be (patch)
tree       b8993f251e3c023aca2856b2784e67eb9f11c8a4
parent     b6eb35371d222c6b7f61210d97ebd7dd9e197458 (diff)
COMPMID-1342 Add grouping support to CLIm2ColKernel
Change-Id: I4afb19751520a90fee27fb49b775cd10e92a94f5
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140476
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/CL/kernels/CLIm2ColKernel.h           |   9
-rw-r--r--  arm_compute/core/NEON/kernels/NEIm2ColKernel.h         |  12
-rw-r--r--  arm_compute/core/utils/misc/ShapeCalculator.h          |  14
-rw-r--r--  arm_compute/runtime/NEON/functions/NEIm2Col.h          |  14
-rw-r--r--  src/core/CL/cl_kernels/im2col.cl                       | 144
-rw-r--r--  src/core/CL/kernels/CLIm2ColKernel.cpp                 |  56
-rw-r--r--  src/core/NEON/kernels/NEIm2ColKernel.cpp               |  12
-rw-r--r--  src/runtime/CL/functions/CLGEMM.cpp                    |   6
-rw-r--r--  src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp    |   2
-rw-r--r--  src/runtime/NEON/functions/NEFlattenLayer.cpp          |   2
-rw-r--r--  src/runtime/NEON/functions/NEFullyConnectedLayer.cpp   |   4
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp  |   4
-rw-r--r--  src/runtime/NEON/functions/NEIm2Col.cpp                |  10
-rw-r--r--  src/runtime/NEON/functions/NELocallyConnectedLayer.cpp |   2
-rw-r--r--  tests/datasets/ShapeDatasets.h                         |  33
-rw-r--r--  tests/validation/CL/Im2Col.cpp                         |  80
-rw-r--r--  tests/validation/NEON/Im2Col.cpp                       |   9
-rw-r--r--  tests/validation/fixtures/Im2ColFixture.h              |  10
-rw-r--r--  tests/validation/reference/Im2Col.cpp                  |  45
-rw-r--r--  tests/validation/reference/Im2Col.h                    |   3
20 files changed, 368 insertions(+), 103 deletions(-)
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index ae19319047..c678f277cb 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -76,8 +76,10 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] has_bias In case biases are provided expands the matrix with 1.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
*/
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U));
+ void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
@@ -88,10 +90,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] has_bias In case biases are provided expands the matrix with 1.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -103,6 +107,7 @@ public:
unsigned int _num_elems_processed_per_iteration;
Size2D _kernel_dims;
PadStrideInfo _conv_info;
+ unsigned int _num_groups;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLIM2COLKERNEL_H__ */
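
Since num_groups is a defaulted trailing parameter, existing CL call sites keep compiling unchanged; passing it explicitly opts into the grouped 4D output layout. A minimal host-side sketch of the new entry point (hypothetical shapes and function name; assumes an initialized CL scheduler and an NCHW input):

    #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void grouped_im2col_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32)); // [W, H, C] in NCHW

        // 3x3 kernel, stride 1, pad 1, no bias, dilation (1, 1), 2 groups.
        // configure() auto-initializes dst to [C/G * 9, 16*16, G] via compute_im2col_conv_shape().
        CLIm2ColKernel im2col;
        im2col.configure(&src, &dst, Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1),
                         false /* has_bias */, Size2D(1U, 1U), 2 /* num_groups */);
    }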
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index 19da7cfd53..37145a38c1 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -83,12 +83,13 @@ public:
* @param[in] kernel_dims The kernel dimensions (width and height).
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] has_bias In case biases are provided expands the matrix with 1.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
* @param[in] is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments
* @param[in] is_flatten (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, bool is_fully_connected = false, bool is_flatten = false, const Size2D &dilation = Size2D(1U, 1U));
+ bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, bool is_fully_connected = false, bool is_flatten = false);
/** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
@@ -98,14 +99,15 @@ public:
* @param[in] kernel_dims The kernel dimensions (width and height).
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] has_bias In case biases are provided expands the matrix with 1.
- * @param[in] is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments
- * @param[in] is_flatten (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
+ * @param[in] is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments
+ * @param[in] is_flatten (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, bool is_fully_connected = false, bool is_flatten = false, const Size2D &dilation = Size2D(1U, 1U));
+ bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, bool is_fully_connected = false, bool is_flatten = false);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
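
Note that this reorders the trailing parameters: dilation and num_groups now precede the two booleans, so NEON call sites that passed is_fully_connected or is_flatten positionally must be updated, as the runtime changes later in this patch do. An illustrative before/after:

    // Before this patch:
    //   kernel.configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true);
    // After it, the booleans sit behind the new defaulted arguments:
    kernel.configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false,
                     Size2D(1U, 1U), 1, true /* is_fully_connected */);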
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index f726ce9ad3..fc6abf95f3 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -192,9 +192,15 @@ inline TensorShape compute_deconvolution_shape(const ITensorInfo &input, unsigne
return scale_out_shape;
}
-inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z)
+inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z,
+ unsigned int num_groups = 1)
{
- // The output shape will be the 2D shape used as input for GEMM [ out_channels * kernel_area, num_elems_per_out_channel ]
+ // The output shape will be the 3D shape [ out_channels * kernel_area, num_elems_per_out_channel, batches ] if batch_size_on_z == true
+ // or the 4D shape [ out_channels * kernel_area / num_groups, num_elems_per_out_channel, num_groups, batches ] if batch_size_on_z == false
+
+ ARM_COMPUTE_ERROR_ON(num_groups == 0);
+ ARM_COMPUTE_ERROR_ON(num_groups > 1 && input->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_ERROR_ON(num_groups > 1 && batch_size_on_z);
TensorShape output_shape{ input->tensor_shape() };
@@ -204,7 +210,7 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
- output_shape.set(0, (output_shape[channel_idx] * kernel_dims.area() + (has_bias ? 1 : 0)));
+ output_shape.set(0, (output_shape[channel_idx] / num_groups * kernel_dims.area() + (has_bias ? 1 : 0))); // NOLINT
output_shape.set(1, (out_dims.first * out_dims.second));
if(batch_size_on_z && output_shape.num_dimensions() >= 3)
{
@@ -212,7 +218,7 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz
}
else
{
- output_shape.set(2, 1);
+ output_shape.set(2, num_groups);
}
return output_shape;
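
A worked example of the grouped branch (hypothetical dimensions; src is assumed to be a tensor initialized with the shape below): an NCHW input of [W=14, H=14, C=8, N=2] with a 3x3 kernel, stride 1, pad 1, no bias, dilation (1, 1) and num_groups = 4 gives out_dims = 14 * 14 = 196 convolved positions, so the function returns a 4D shape:

    // dim 0: (C / num_groups) * kernel area = (8 / 4) * 9 = 18  (no +1: has_bias == false)
    // dim 1: convolved positions            = 14 * 14     = 196
    // dim 2: num_groups                     = 4            (batch_size_on_z == false)
    // dim 3: batches                        = 2
    const TensorShape shape = arm_compute::misc::shape_calculator::compute_im2col_conv_shape(
        src.info(), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1),
        false /* has_bias */, Size2D(1U, 1U), false /* batch_size_on_z */, 4 /* num_groups */);
    // shape == TensorShape{ 18U, 196U, 4U, 2U }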
diff --git a/arm_compute/runtime/NEON/functions/NEIm2Col.h b/arm_compute/runtime/NEON/functions/NEIm2Col.h
index d888b7e8f5..9df4f070d8 100644
--- a/arm_compute/runtime/NEON/functions/NEIm2Col.h
+++ b/arm_compute/runtime/NEON/functions/NEIm2Col.h
@@ -49,10 +49,13 @@ public:
* @param[in] kernel_dims The kernel dimensions (width and height).
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] has_bias In case biases are provided expands the matrix with 1.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
* @param[in] is_fully_connected (Optional) Determines whether this function will be called by @ref NEFullyConnectedLayer in order to validate the arguments
* @param[in] is_flatten (Optional) Determines whether this function will be called by @ref NEFlattenLayer in order to validate the arguments
*/
- void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected = false, bool is_flatten = false);
+ void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1, bool is_fully_connected = false, bool is_flatten = false);
/** Static function to check if given info will lead to a valid configuration of @ref NEIm2Col
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
@@ -62,12 +65,15 @@ public:
* @param[in] kernel_dims The kernel dimensions (width and height).
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] has_bias In case biases are provided expands the matrix with 1.
- * @param[in] is_fully_connected Determines whether this function will be called by @ref NEFullyConnectedLayer in order to validate the arguments
- * @param[in] is_flatten Determines whether this function will be called by @ref NEFlattenLayer in order to validate the arguments
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
+ * @param[in] is_fully_connected (Optional) Determines whether this function will be called by @ref NEFullyConnectedLayer in order to validate the arguments
+ * @param[in] is_flatten (Optional) Determines whether this function will be called by @ref NEFlattenLayer in order to validate the arguments
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1, bool is_fully_connected = false, bool is_flatten = false);
// Inherited methods overridden:
void run() override;
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 5db1d6ce33..186d5a80ad 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -43,6 +43,7 @@
* @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -57,13 +58,19 @@
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
__kernel void im2col1x1_stridex1_nchw(
TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
uint src_stride_w,
uint dst_stride_w)
{
@@ -86,13 +93,22 @@ __kernel void im2col1x1_stridex1_nchw(
const uint yi = yc * STRIDE_Y;
// Calculate output indices
- const uint xo = ch;
+
+#if defined(NUM_GROUPS)
+ const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
+ const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const uint xo = ch;
+#endif // defined(NUM_GROUPS)
const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
// Get input and output address
__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
+#if defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
VEC_DATA_TYPE(DATA_TYPE, 4)
data = vload4(0, (__global DATA_TYPE *)input_ptr);
@@ -106,7 +122,11 @@ __kernel void im2col1x1_stridex1_nchw(
*(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
{
*((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
*((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
@@ -130,6 +150,7 @@ __kernel void im2col1x1_stridex1_nchw(
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -144,13 +165,19 @@ __kernel void im2col1x1_stridex1_nchw(
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
__kernel void im2col_generic_nchw(
TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
uint src_stride_w,
uint dst_stride_w)
{
@@ -164,11 +191,20 @@ __kernel void im2col_generic_nchw(
const int yi = yc * STRIDE_Y - PAD_TOP;
// Calculate output indices
- const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#endif // defined(NUM_GROUPS)
const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else // defined(NUM_GROUPS)
__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
// Linearize convolution elements
for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
@@ -193,7 +229,11 @@ __kernel void im2col_generic_nchw(
}
#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
{
*output_ptr = 1.0f;
}
@@ -225,13 +265,19 @@ __kernel void im2col_generic_nchw(
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
__kernel void im2col3x3_nchw(
TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
uint src_stride_w,
uint dst_stride_w)
{
@@ -245,13 +291,21 @@ __kernel void im2col3x3_nchw(
const int yi = yc * STRIDE_Y - PAD_TOP;
// Calculate output indices
- const int xo = ch * 9; // 3x3
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * 9; // 3x3
+#endif // defined(NUM_GROUPS)
const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
// Get input and output address
__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
+#if defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
VEC_DATA_TYPE(DATA_TYPE, 3)
row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
@@ -281,7 +335,11 @@ __kernel void im2col3x3_nchw(
*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
{
*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
}
@@ -298,6 +356,7 @@ __kernel void im2col3x3_nchw(
* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -312,13 +371,19 @@ __kernel void im2col3x3_nchw(
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
__kernel void im2col5x5_nchw(
TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
uint src_stride_w,
uint dst_stride_w)
{
@@ -332,7 +397,12 @@ __kernel void im2col5x5_nchw(
const int yi = yc * STRIDE_Y - PAD_TOP;
// Calculate output indices
- const int xo = ch * 25; // 5x5
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * 25; // 5x5
+#endif // defined(NUM_GROUPS)
const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
@@ -353,8 +423,11 @@ __kernel void im2col5x5_nchw(
// Get input and output address
__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
+#if defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
{
VEC_DATA_TYPE(DATA_TYPE, 4)
@@ -455,7 +528,11 @@ __kernel void im2col5x5_nchw(
}
#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
{
*((__global DATA_TYPE *)output_ptr) = 1.0f;
}
@@ -471,6 +548,7 @@ __kernel void im2col5x5_nchw(
* @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -485,13 +563,19 @@ __kernel void im2col5x5_nchw(
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
__kernel void im2col11x11_padx0_pady0_nchw(
TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
uint src_stride_w,
uint dst_stride_w)
{
@@ -505,13 +589,22 @@ __kernel void im2col11x11_padx0_pady0_nchw(
const int yi = yc * STRIDE_Y;
// Calculate output indices
- const int xo = ch * 121; // 11x11
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * 121; // 11x11
+#endif // defined(NUM_GROUPS)
const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
// Get input and output address
__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
+#if defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
{
VEC_DATA_TYPE(DATA_TYPE, 8)
row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
@@ -655,7 +748,11 @@ __kernel void im2col11x11_padx0_pady0_nchw(
}
#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
{
*((__global DATA_TYPE *)output_ptr) = 1.0f;
}
@@ -671,6 +768,7 @@ __kernel void im2col11x11_padx0_pady0_nchw(
* @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -685,13 +783,19 @@ __kernel void im2col11x11_padx0_pady0_nchw(
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
__kernel void im2col_generic_padx0_pady0_nchw(
TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
uint src_stride_w,
uint dst_stride_w)
{
@@ -703,11 +807,23 @@ __kernel void im2col_generic_padx0_pady0_nchw(
// Calculate input indices
const int xi = xc * STRIDE_X;
const int yi = yc * STRIDE_Y;
+
// Calculate output indices
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
- const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#endif // defined(NUM_GROUPS)
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else // defined(NUM_GROUPS)
__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
+
// Linearize convolution elements
for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
{
@@ -734,7 +850,11 @@ __kernel void im2col_generic_padx0_pady0_nchw(
} /* End of loop over KERNEL_HEIGHT */
#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
{
*output_ptr = 1.0f;
}
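
All of these NCHW kernels share the same per-group indexing: a channel ch maps to a column block xo inside its group and to a group plane zo along Z. A small host-side sketch of that arithmetic (hypothetical values):

    // SRC_DEPTH = 8, NUM_GROUPS = 2 -> 4 channels per group; 3x3 kernel -> area 9.
    const unsigned int src_depth = 8, num_groups = 2, kernel_area = 9;
    for(unsigned int ch = 0; ch < src_depth; ++ch)
    {
        const unsigned int xo = (ch % (src_depth / num_groups)) * kernel_area; // column offset within the group
        const unsigned int zo = ch / (src_depth / num_groups);                 // group (Z) plane
        // ch 0..3 -> zo = 0, xo = 0, 9, 18, 27;  ch 4..7 -> zo = 1, xo = 0, 9, 18, 27
    }
    // With HAS_BIAS, only the last channel of each group satisfies
    // xo / kernel_area == src_depth / num_groups - 1, so each group's matrix
    // gets exactly one trailing bias column of 1.0f.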
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 42bb96c16f..3d71567eee 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -54,18 +54,24 @@ struct Im2ColConfiguration
bool is_padding_required_nchw{};
};
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+ unsigned int num_groups)
{
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % num_groups) != 0);
if(output->total_size() > 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true));
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -74,12 +80,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw)
+ unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true);
+ TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
@@ -141,7 +147,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
return std::make_pair(err, win);
}
-Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
const DataLayout data_layout = input->data_layout();
const DataType data_type = input->data_type();
@@ -177,6 +183,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size
build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
+ build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->quantization_info().offset), "-DPAD_VALUE=0");
build_opts.add_option_if(has_bias, "-DHAS_BIAS");
@@ -274,14 +281,15 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size
} // namespace
CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info()
+ : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
{
}
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
const DataLayout data_layout = input->info()->data_layout();
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -292,7 +300,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
// Select and configure the optimal OpenCL kernel to run.
// This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
// and the padding requirement flag
- Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation);
+ Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation, num_groups);
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(im2col_config.kernel_name, im2col_config.build_options));
@@ -303,10 +311,11 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
_num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
_kernel_dims = kernel_dims; // Only needed by the Tuner
_conv_info = conv_info; // Only needed by the Tuner
+ _num_groups = num_groups;
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
- im2col_config.is_padding_required_nchw);
+ im2col_config.is_padding_required_nchw, num_groups);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
@@ -322,12 +331,13 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
_config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
}
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+ unsigned int num_groups)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation));
- Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
+ Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation, num_groups);
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
- im2col_config.is_padding_required_nchw)
+ im2col_config.is_padding_required_nchw, num_groups)
.first);
return Status{};
}
@@ -337,8 +347,6 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
-
// Get initial windows
// Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -353,7 +361,7 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
Window slice_in = first_slice_3d;
Window slice_out = window_output.first_slice_window_2D();
- if(data_layout == DataLayout::NHWC)
+ if(_input->info()->data_layout() == DataLayout::NHWC)
{
const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
const int num_batches = tmp_win[3].end();
@@ -379,13 +387,21 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
do
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice_in);
- add_2D_tensor_argument(idx, _output, slice_out);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+ if(_num_groups == 1)
+ {
+ add_2D_tensor_argument(idx, _output, slice_out);
+ }
+ else
+ {
+ add_3D_tensor_argument(idx, _output, slice_out);
+ }
enqueue(queue, *this, slice, lws_hint());
}
while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 8cb4f4b889..98b1488a9d 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -45,12 +45,13 @@ using namespace arm_compute;
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
+ bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on NEON");
if(output->total_size() > 0)
{
@@ -290,13 +291,14 @@ NEIm2ColKernel::NEIm2ColKernel()
}
void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
+ bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Perform validation step
ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
+ ARM_COMPUTE_UNUSED(num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten));
const DataLayout data_layout = input->info()->data_layout();
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -378,9 +380,9 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
}
Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
+ bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten));
return Status{};
}
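
On NEON the new parameter exists only for interface symmetry and is rejected during validation, so a caller can probe support before configuring. An illustrative sketch (hypothetical tensors):

    const Status status = NEIm2ColKernel::validate(src.info(), dst.info(), Size2D(3U, 3U),
                                                   PadStrideInfo(1, 1, 1, 1), false /* has_bias */,
                                                   Size2D(1U, 1U), 2 /* num_groups */);
    if(status.error_code() != ErrorCode::OK)
    {
        // "Number of groups greater than one are not supported on NEON":
        // fall back to num_groups = 1 or dispatch to the CL backend instead.
    }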
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index a8d7058f2a..1ad8531920 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -181,7 +181,6 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
- TensorInfo tmp_output_info{};
// Get the GPU target
const GPUTarget gpu_target = CLScheduler::get().target();
@@ -229,13 +228,12 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
}
// Validate matrix multiply
- auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info, gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target));
if(beta != 0 && c != nullptr)
{
// Validate matrix addition kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, &tmp_output_info, beta));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
}
return Status{};
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index ca6157ef13..26fd906dd1 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -391,7 +391,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
const Size2D kernel_dims(kernel_width, kernel_height);
// Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true);
+ TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true /* num_groups == 1, num_groups */);
auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape));
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 32edf93b63..1814d61e2f 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -32,6 +32,6 @@ using namespace arm_compute;
void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
{
auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
- k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, false, true);
+ k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, false, true);
_kernel = std::move(k);
}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index c2f0283d4e..f1606aa93e 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -118,7 +118,7 @@ void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITenso
// Configure im2col kernel
_memory_group.manage(&_im2col_output);
- _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true);
// Configure matrix multiply kernel
configure_mm(&_im2col_output, weights, output);
@@ -308,7 +308,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
// Validate im2col kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true));
input_to_use = &im2col_input;
}
else
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index df4a040bad..33284470f4 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -233,7 +233,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
_memory_group.manage(&_im2col_output);
// Configure
- _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation);
// Update GEMM input
gemm_input_to_use = &_im2col_output;
@@ -401,7 +401,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
im2col_reshaped_info.set_quantization_info(input->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, false, false, dilation));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
gemm_input_to_use = &im2col_reshaped_info;
}
else if(append_bias)
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 6b95cb0256..4245b650e2 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -34,16 +34,18 @@ NEIm2Col::NEIm2Col()
{
}
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups,
+ bool is_fully_connected, bool is_flatten)
{
_y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+ _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
}
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+ unsigned int num_groups, bool is_fully_connected, bool is_flatten)
{
- return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+ return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
}
void NEIm2Col::run()
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 0737bd2f73..80a2541176 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -113,7 +113,7 @@ Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensor
TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 4a6d778f9f..a1ca983186 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -635,6 +635,39 @@ public:
}
};
+/** Data set containing small grouped im2col tensor shapes. */
+class GroupedIm2ColSmallShapes final : public ShapeDataset
+{
+public:
+ GroupedIm2ColSmallShapes()
+ : ShapeDataset("Shape",
+ {
+ TensorShape{ 11U, 11U, 48U },
+ TensorShape{ 27U, 13U, 24U },
+ TensorShape{ 128U, 64U, 12U, 3U },
+ TensorShape{ 11U, 11U, 48U, 4U },
+ TensorShape{ 27U, 13U, 24U, 4U },
+ TensorShape{ 11U, 11U, 48U, 5U }
+ })
+ {
+ }
+};
+
+/** Data set containing large grouped im2col tensor shapes. */
+class GroupedIm2ColLargeShapes final : public ShapeDataset
+{
+public:
+ GroupedIm2ColLargeShapes()
+ : ShapeDataset("Shape",
+ {
+ TensorShape{ 1921U, 1083U, 12U },
+ TensorShape{ 641U, 485U, 24U, 3U },
+ TensorShape{ 799U, 595U, 12U, 4U },
+ })
+ {
+ }
+};
+
/** Data set containing small grouped weights tensor shapes. */
class GroupedWeightsSmallShapes final : public ShapeDataset
{
diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp
index e8019cca92..cf7c79ad72 100644
--- a/tests/validation/CL/Im2Col.cpp
+++ b/tests/validation/CL/Im2Col.cpp
@@ -53,9 +53,14 @@ const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3
const auto padstrides = framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U),
PadStrideInfo(1U, 1U, 1U, 1U),
PadStrideInfo(2U, 2U, 0U, 2U) });
-const auto conv_args = combine(combine(combine(conv_filter_sizes, padstrides),
- framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
- framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+const auto conv_args = combine(combine(combine(combine(conv_filter_sizes, padstrides),
+ framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+ framework::dataset::make("NumGroups", { 1 }));
+const auto grouped_args = combine(combine(combine(combine(conv_filter_sizes, padstrides),
+ framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+ framework::dataset::make("NumGroups", { 2, 3, 4 }));
} // namespace
TEST_SUITE(CL)
@@ -96,7 +101,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode:
// Validate output
validate(CLAccessor(_target), _reference);
}
-TEST_SUITE_END()
FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
conv_args),
@@ -105,8 +109,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode:
// Validate output
validate(CLAccessor(_target), _reference);
}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE_END()
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
@@ -124,9 +127,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode::
validate(CLAccessor(_target), _reference);
}
TEST_SUITE_END()
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
TEST_SUITE_END()
TEST_SUITE(QASYMM8)
@@ -146,6 +146,68 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMod
}
TEST_SUITE_END()
+TEST_SUITE(Grouped)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+ DataType::F32)),
+ grouped_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+ DataType::F32)),
+ grouped_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+ DataType::F16)),
+ grouped_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+ DataType::F16)),
+ grouped_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+ DataType::QASYMM8)),
+ grouped_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+ DataType::QASYMM8)),
+ grouped_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
TEST_SUITE_END()
TEST_SUITE_END()
} // namespace validation
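For orientation, a minimal sketch of what one grouped CL case above reduces to, mirroring the fixture's compute_target path (the standalone setup, tensor values, and namespace qualification are illustrative; the tests drive this through the fixture's FunctionType):

    // Sketch, assuming the CL backend is initialised
    // (e.g. CLScheduler::get().default_init()) and the usual
    // arm_compute CL headers are included.
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(11U, 11U, 48U), 1, DataType::F32));

    // Output shape computed the same way the fixture does it; with grouping,
    // batch_size_on_z is forced to false.
    const TensorShape dst_shape = misc::shape_calculator::compute_im2col_conv_shape(
        src.info(), Size2D(3U, 3U), PadStrideInfo(1U, 1U, 0U, 0U),
        /*has_bias=*/true, Size2D(1U, 1U), /*batch_size_on_z=*/false, /*num_groups=*/2U);
    dst.allocator()->init(TensorInfo(dst_shape, 1, DataType::F32));

    CLIm2ColKernel im2col;
    im2col.configure(&src, &dst, Size2D(3U, 3U), PadStrideInfo(1U, 1U, 0U, 0U),
                     /*has_bias=*/true, Size2D(1U, 1U), /*num_groups=*/2U);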
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index f011ebe935..0ea68bf49d 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -40,9 +40,10 @@ namespace validation
namespace
{
const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U), Size2D(3U, 1U), Size2D(1U, 5U), Size2D(5U, 5U), Size2D(7U, 7U) });
-const auto conv_args = combine(combine(combine(conv_filter_sizes, framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
- framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
- framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+const auto conv_args = combine(combine(combine(combine(conv_filter_sizes, framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
+ framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+ framework::dataset::make("NumGroups", { 1 }));
} // namespace
TEST_SUITE(NEON)
TEST_SUITE(Im2Col)
@@ -66,7 +67,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
framework::dataset::make("Expected", { false, false, false, false, true })),
input_info, output_info, has_bias, expected)
{
- bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias, false, false));
+ bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
}
// clang-format on
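On the NEON side the change is mechanical: the dataset gains a NumGroups axis pinned to 1 (grouping stays CL-only in this patch) and the Validate case drops the two trailing booleans of the old NEIm2Col::validate overload. An illustrative call against the new signature (shapes assumed: 18 = 3*3 kernel * 2 channels, 80 = 8*10 convolved positions for a 10x12 input with stride 1 and no padding):

    const TensorInfo input_info(TensorShape(10U, 12U, 2U), 1, DataType::F32);
    const TensorInfo output_info(TensorShape(18U, 80U), 1, DataType::F32);
    const bool ok = bool(NEIm2Col::validate(&input_info, &output_info,
                                            Size2D(3U, 3U), PadStrideInfo(),
                                            /*has_bias=*/false));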
diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h
index da2576b37c..b5e83a9872 100644
--- a/tests/validation/fixtures/Im2ColFixture.h
+++ b/tests/validation/fixtures/Im2ColFixture.h
@@ -50,13 +50,14 @@ class Im2ColValidationFixture : public framework::Fixture
public:
template <typename...>
void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout,
- bool channels_first_output_nhwc)
+ unsigned int num_groups, bool channels_first_output_nhwc)
{
_kernel_dims = kernel_dims;
_conv_info = conv_info;
_quant_info = quant_info;
_data_layout = data_layout;
_has_bias = data_type != DataType::QASYMM8;
+ _num_groups = num_groups;
if(_data_layout == DataLayout::NHWC)
{
@@ -66,7 +67,7 @@ public:
TensorInfo input_info(input_shape, 1, data_type);
input_info.set_data_layout(_data_layout);
- const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z);
+ const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z && _num_groups == 1, _num_groups);
_target = compute_target(input_shape, output_shape, data_type);
compute_reference(input_shape, output_shape, data_type, channels_first_output_nhwc);
@@ -87,7 +88,7 @@ protected:
// Create and configure function
FunctionType im2col_func;
- im2col_func.configure(&src, &dst, _kernel_dims, _conv_info, _has_bias);
+ im2col_func.configure(&src, &dst, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), _num_groups);
ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -117,7 +118,7 @@ protected:
// Fill reference
fill(src);
- reference::im2col<T>(src, _reference, _kernel_dims, _conv_info, _has_bias, channels_first_output_nhwc);
+ reference::im2col<T>(src, _reference, _kernel_dims, _conv_info, _has_bias, _num_groups, channels_first_output_nhwc);
}
TensorType _target{};
SimpleTensor<T> _reference{};
@@ -126,6 +127,7 @@ protected:
DataLayout _data_layout{};
QuantizationInfo _quant_info{};
bool _has_bias{};
+ unsigned int _num_groups{};
};
} // namespace validation
} // namespace test
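The subtle part of the fixture change is the batch_size_on_z && _num_groups == 1 term: when grouping is enabled the group index takes the third output dimension, so batches can no longer sit on Z. Illustrative shape arithmetic for one grouped NCHW case under the patch's conventions (all values assumed):

    constexpr unsigned int W = 11, H = 11, C = 48, N = 4;   // input, NCHW
    constexpr unsigned int Kw = 3, Kh = 3, G = 2;           // kernel and groups
    constexpr unsigned int conv_w = (W - Kw) + 1;           // 9 (stride 1, no pad)
    constexpr unsigned int conv_h = (H - Kh) + 1;           // 9
    constexpr unsigned int row_len = (C / G) * Kw * Kh + 1; // 217 (+1 for the bias)
    // Expected im2col output shape: [row_len, conv_w * conv_h, G, N] = [217, 81, 2, 4]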
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
index 2459499474..0c41d88f3e 100644
--- a/tests/validation/reference/Im2Col.cpp
+++ b/tests/validation/reference/Im2Col.cpp
@@ -36,7 +36,7 @@ namespace validation
namespace reference
{
template <typename T>
-void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
const int stride_x = conv_info.stride().first;
@@ -58,26 +58,32 @@ void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D
for(int b = 0; b < batches; ++b)
{
- for(int yo = 0; yo < dst_height; ++yo)
+ for(int g = 0; g < static_cast<int>(num_groups); ++g)
{
- // Compute input spatial coordinates
- const int xi = (yo % convolved_dims.first) * stride_x;
- const int yi = (yo / convolved_dims.first) * stride_y;
+ const int first_group_ch = g * (src_channels / num_groups);
+ const int last_group_ch = (g + 1) * (src_channels / num_groups);
- for(int ci = 0; ci < src_channels; ++ci)
+ for(int yo = 0; yo < dst_height; ++yo)
{
- for(int yk = 0; yk < kernel_height; ++yk)
+ // Compute input spatial coordinates
+ const int xi = (yo % convolved_dims.first) * stride_x;
+ const int yi = (yo / convolved_dims.first) * stride_y;
+
+ for(int ci = first_group_ch; ci < last_group_ch; ++ci)
{
- for(int xk = 0; xk < kernel_width; ++xk)
+ for(int yk = 0; yk < kernel_height; ++yk)
{
- dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+ for(int xk = 0; xk < kernel_width; ++xk)
+ {
+ dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+ }
}
}
- }
- if(has_bias)
- {
- dst[dst_idx++] = static_cast<T>(1);
+ if(has_bias)
+ {
+ dst[dst_idx++] = static_cast<T>(1);
+ }
}
}
}
@@ -179,13 +185,13 @@ void im2col_nhwc_channel_first(const SimpleTensor<T> &src, SimpleTensor<T> &dst,
}
template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc)
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups, bool channels_first_output_nhwc)
{
switch(src.data_layout())
{
case DataLayout::NCHW:
{
- im2col_nchw(src, dst, kernel_dims, conv_info, has_bias);
+ im2col_nchw(src, dst, kernel_dims, conv_info, has_bias, num_groups);
break;
}
case DataLayout::NHWC:
@@ -208,9 +214,12 @@ void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kern
}
}
-template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
-template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
-template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+ bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+ bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+ bool channels_first_output_nhwc);
} // namespace reference
} // namespace validation
} // namespace test
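The restructured NCHW reference above wraps the old per-channel walk in a group loop: group g gathers kernel patches only from its own channel slice and, when a bias is present, appends one trailing 1 per output row. The slice bounds for an illustrative C = 48, G = 4 split, matching first_group_ch / last_group_ch in the reference:

    const int C = 48, G = 4; // illustrative values
    for(int g = 0; g < G; ++g)
    {
        const int first = g * (C / G);       // 0, 12, 24, 36
        const int last  = (g + 1) * (C / G); // 12, 24, 36, 48
        // group g reads channels [first, last)
    }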
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
index b1ebaf25da..84ee237453 100644
--- a/tests/validation/reference/Im2Col.h
+++ b/tests/validation/reference/Im2Col.h
@@ -35,7 +35,8 @@ namespace validation
namespace reference
{
template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc = false);
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+ bool channels_first_output_nhwc = false);
} // namespace reference
} // namespace validation
} // namespace test
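One consequence of the new declaration worth noting: num_groups is a required parameter placed before the defaulted channels_first_output_nhwc flag, so callers that previously passed the flag positionally must now supply a group count as well. Illustrative call sites (tensors assumed to be set up already, with src in the matching data layout):

    reference::im2col<float>(src, dst, Size2D(3U, 3U), PadStrideInfo(),
                             /*has_bias=*/true, /*num_groups=*/2U); // NCHW, grouped
    reference::im2col<float>(src, dst, Size2D(3U, 3U), PadStrideInfo(),
                             /*has_bias=*/true, /*num_groups=*/1U,
                             /*channels_first_output_nhwc=*/true);  // NHWC channels-first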