From 0f170396e84836ad8c54d54421e95c61812968be Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Wed, 18 Jul 2018 16:13:12 +0100
Subject: COMPMID-1342 Add grouping support to CLIm2ColKernel

Change-Id: I4afb19751520a90fee27fb49b775cd10e92a94f5
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140476
Reviewed-by: Gian Marco Iodice
Tested-by: Jenkins
---
 src/core/CL/cl_kernels/im2col.cl                 | 144 +++++++++++++++++++--
 src/core/CL/kernels/CLIm2ColKernel.cpp           |  56 +++++---
 src/core/NEON/kernels/NEIm2ColKernel.cpp         |  12 +-
 src/runtime/CL/functions/CLGEMM.cpp              |   6 +-
 .../CL/functions/CLGEMMConvolutionLayer.cpp      |   2 +-
 src/runtime/NEON/functions/NEFlattenLayer.cpp    |   2 +-
 .../NEON/functions/NEFullyConnectedLayer.cpp     |   4 +-
 .../NEON/functions/NEGEMMConvolutionLayer.cpp    |   4 +-
 src/runtime/NEON/functions/NEIm2Col.cpp          |  10 +-
 .../NEON/functions/NELocallyConnectedLayer.cpp   |   2 +-
 10 files changed, 190 insertions(+), 52 deletions(-)

diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 5db1d6ce33..186d5a80ad 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -43,6 +43,7 @@
  * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
  * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
  *
  * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
  * @param[in] src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -57,13 +58,19 @@
  * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col1x1_stridex1_nchw(
     TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
     IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
     uint src_stride_w,
     uint dst_stride_w)
 {
@@ -86,13 +93,22 @@ __kernel void im2col1x1_stridex1_nchw(
     const uint yi = yc * STRIDE_Y;
 
     // Calculate output indices
-    const uint xo = ch;
+
+#if defined(NUM_GROUPS)
+    const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
+    const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+    const uint xo = ch;
+#endif // defined(NUM_GROUPS)
     const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
 
     // Get input and output address
     __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
     __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
 
     VEC_DATA_TYPE(DATA_TYPE, 4)
     data = vload4(0, (__global DATA_TYPE *)input_ptr);
@@ -106,7 +122,11 @@
     *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
 
 #ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
     if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
     {
         *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
         *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
@@ -130,6 +150,7 @@ __kernel void im2col1x1_stridex1_nchw(
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
  * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
  *
  * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
  * @param[in] src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -144,13 +165,19 @@ __kernel void im2col1x1_stridex1_nchw(
  * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col_generic_nchw(
     TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
     IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
     uint src_stride_w,
     uint dst_stride_w)
 {
@@ -164,11 +191,20 @@ __kernel void im2col_generic_nchw(
     const int yi = yc * STRIDE_Y - PAD_TOP;
 
     // Calculate output indices
-    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#endif // defined(NUM_GROUPS)
     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 
-    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else // defined(NUM_GROUPS)
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
 
     // Linearize convolution elements
     for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
@@ -193,7 +229,11 @@
     }
 
 #ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
     if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
     {
         *output_ptr = 1.0f;
     }
@@ -225,13 +265,19 @@
  * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col3x3_nchw(
     TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
     IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
     uint src_stride_w,
     uint dst_stride_w)
 {
@@ -245,13 +291,21 @@
     const int yi = yc * STRIDE_Y - PAD_TOP;
 
     // Calculate output indices
-    const int xo = ch * 9; // 3x3
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+    const int xo = ch * 9; // 3x3
+#endif // defined(NUM_GROUPS)
     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 
     // Get input and output address
     __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
     __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
 
     VEC_DATA_TYPE(DATA_TYPE, 3)
     row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
@@ -281,7 +335,11 @@
     *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
 
 #ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
     if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
     {
         *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
     }
@@ -298,6 +356,7 @@
  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
  *
  * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
  * @param[in] src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -312,13 +371,19 @@
  * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col5x5_nchw(
     TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
     IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
     uint src_stride_w,
     uint dst_stride_w)
 {
@@ -332,7 +397,12 @@
     const int yi = yc * STRIDE_Y - PAD_TOP;
 
     // Calculate output indices
-    const int xo = ch * 25; // 5x5
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+    const int xo = ch * 25; // 5x5
+#endif // defined(NUM_GROUPS)
     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 
 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
@@ -353,8 +423,11 @@
 
     // Get input and output address
     __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
     __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
 
     {
         VEC_DATA_TYPE(DATA_TYPE, 4)
@@ -455,7 +528,11 @@
     }
 
 #ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
     if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
     {
         *((__global DATA_TYPE *)output_ptr) = 1.0f;
     }
@@ -471,6 +548,7 @@
  * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
  *
  * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
  * @param[in] src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -485,13 +563,19 @@
  * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col11x11_padx0_pady0_nchw(
     TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
     IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
     uint src_stride_w,
     uint dst_stride_w)
 {
@@ -505,13 +589,22 @@
     const int yi = yc * STRIDE_Y;
 
     // Calculate output indices
-    const int xo = ch * 121; // 11x11
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+    const int xo = ch * 121; // 11x11
+#endif // defined(NUM_GROUPS)
     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 
     // Get input and output address
    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
     __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
     {
         VEC_DATA_TYPE(DATA_TYPE, 8)
         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
@@ -655,7 +748,11 @@
     }
 
 #ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
     if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
     {
         *((__global DATA_TYPE *)output_ptr) = 1.0f;
     }
@@ -671,6 +768,7 @@
  * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
  *
  * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
  * @param[in] src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -685,13 +783,19 @@
  * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col_generic_padx0_pady0_nchw(
     TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
     IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
     uint src_stride_w,
     uint dst_stride_w)
 {
@@ -703,11 +807,23 @@
     // Calculate input indices
     const int xi = xc * STRIDE_X;
     const int yi = yc * STRIDE_Y;
+
+    // Calculate output indices
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
     const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
-    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#endif // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else // defined(NUM_GROUPS)
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
+
     // Linearize convolution elements
     for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
     {
@@ -734,7 +850,11 @@
     } /* End of loop over KERNEL_HEIGHT */
 
 #ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
     if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
     {
         *output_ptr = 1.0f;
     }
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 42bb96c16f..3d71567eee 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -54,18 +54,24 @@ struct Im2ColConfiguration
     bool is_padding_required_nchw{};
 };
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+                          unsigned int num_groups)
 {
+    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % num_groups) != 0);
 
     if(output->total_size() > 0)
     {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true));
+        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -74,12 +80,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                                                        unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw)
+                                                        unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto initialization if not yet initialized
-    TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true);
+    TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
 
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
 
@@ -141,7 +147,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     return std::make_pair(err, win);
 }
 
-Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
     const DataLayout data_layout = input->data_layout();
     const DataType   data_type   = input->data_type();
@@ -177,6 +183,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size
     build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
     build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
     build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
+    build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
     build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->quantization_info().offset), "-DPAD_VALUE=0");
     build_opts.add_option_if(has_bias, "-DHAS_BIAS");
 
@@ -274,14 +281,15 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size
 } // namespace
 
 CLIm2ColKernel::CLIm2ColKernel()
-    : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info()
+    : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
 {
 }
 
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+                               unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
 
     const DataLayout   data_layout = input->info()->data_layout();
     const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -292,7 +300,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     // Select and configure the optimal OpenCL kernel to run.
     // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
     // and the padding requirement flag
-    Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation);
+    Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation, num_groups);
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(im2col_config.kernel_name, im2col_config.build_options));
@@ -303,10 +311,11 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
     _kernel_dims                       = kernel_dims; // Only needed by the Tuner
     _conv_info                         = conv_info;   // Only needed by the Tuner
+    _num_groups                        = num_groups;
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
-                                                    im2col_config.is_padding_required_nchw);
+                                                    im2col_config.is_padding_required_nchw, num_groups);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -322,12 +331,13 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
 }
 
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+                                unsigned int num_groups)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation));
-    Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
+    Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation, num_groups);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
-                                                              im2col_config.is_padding_required_nchw)
+                                                              im2col_config.is_padding_required_nchw, num_groups)
                                 .first);
     return Status{};
 }
@@ -337,8 +347,6 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    const DataLayout data_layout = _input->info()->data_layout();
-
     // Get initial windows
     // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -353,7 +361,7 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
     Window slice_in  = first_slice_3d;
     Window slice_out = window_output.first_slice_window_2D();
 
-    if(data_layout == DataLayout::NHWC)
+    if(_input->info()->data_layout() == DataLayout::NHWC)
     {
         const Window tmp_win     = window.collapse_if_possible(ICLKernel::window(), 3);
         const int    num_batches = tmp_win[3].end();
@@ -379,13 +387,21 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
     slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 
+    unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
+    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
     do
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+        if(_num_groups == 1)
+        {
+            add_2D_tensor_argument(idx, _output, slice_out);
+        }
+        else
+        {
+            add_3D_tensor_argument(idx, _output, slice_out);
+        }
         enqueue(queue, *this, slice, lws_hint());
     }
     while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 8cb4f4b889..98b1488a9d 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -45,12 +45,13 @@ using namespace arm_compute;
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                          bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
+                          bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on NEON");
 
     if(output->total_size() > 0)
     {
@@ -290,13 +291,14 @@ NEIm2ColKernel::NEIm2ColKernel()
 }
 
 void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                               bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
+                               bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
     ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
+    ARM_COMPUTE_UNUSED(num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten));
 
     const DataLayout   data_layout = input->info()->data_layout();
     const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -378,9 +380,9 @@
 }
 
 Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                                bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
+                                bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten));
     return Status{};
 }
 
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index a8d7058f2a..1ad8531920 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -181,7 +181,6 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
 
     TensorInfo tmp_a_info{};
     TensorInfo tmp_b_info{};
-    TensorInfo tmp_output_info{};
 
     // Get the GPU target
     const GPUTarget gpu_target = CLScheduler::get().target();
@@ -229,13 +228,12 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     }
 
     // Validate matrix multiply
-    auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info, gpu_target));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target));
 
     if(beta != 0 && c != nullptr)
     {
         // Validate matrix addition kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, &tmp_output_info, beta));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
     }
 
     return Status{};
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index ca6157ef13..26fd906dd1 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -391,7 +391,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
     const Size2D kernel_dims(kernel_width, kernel_height);
 
     // Output tensor auto initialization if not yet initialized
-    TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true);
+    TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true /* num_groups == 1, num_groups */);
 
     auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape));
 
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 32edf93b63..1814d61e2f 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -32,6 +32,6 @@ using namespace arm_compute;
 
 void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
-    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, false, true);
+    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, false, true);
     _kernel = std::move(k);
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index c2f0283d4e..f1606aa93e 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -118,7 +118,7 @@ void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITenso
 
     // Configure im2col kernel
     _memory_group.manage(&_im2col_output);
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true);
+    _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true);
 
     // Configure matrix multiply kernel
     configure_mm(&_im2col_output, weights, output);
@@ -308,7 +308,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
         ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
 
         // Validate im2col kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true));
         input_to_use = &im2col_input;
     }
     else
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index df4a040bad..33284470f4 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -233,7 +233,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
         _memory_group.manage(&_im2col_output);
 
         // Configure
-        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
+        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation);
 
         // Update GEMM input
         gemm_input_to_use = &_im2col_output;
@@ -401,7 +401,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
         im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
         im2col_reshaped_info.set_quantization_info(input->quantization_info());
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, false, false, dilation));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
         gemm_input_to_use = &im2col_reshaped_info;
     }
     else if(append_bias)
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 6b95cb0256..4245b650e2 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -34,16 +34,18 @@ NEIm2Col::NEIm2Col()
 {
 }
 
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups,
+                         bool is_fully_connected, bool is_flatten)
 {
     _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
-    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
 }
 
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+                          unsigned int num_groups, bool is_fully_connected, bool is_flatten)
 {
-    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
 }
 
 void NEIm2Col::run()
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 0737bd2f73..80a2541176 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -113,7 +113,7 @@ Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensor
     TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
     TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
     ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
     ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
     ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
--
cgit v1.2.1
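
For readers checking the grouped destination layout this patch introduces, the short standalone C++ sketch below (not part of the change; src_depth, num_groups, kernel_w and kernel_h are illustrative constants) mirrors the NUM_GROUPS index arithmetic used by the NCHW kernels above: the X offset restarts inside each group and Z selects the group, which is why the destination becomes a 3D tensor per batch and CLIm2ColKernel::run() switches from add_2D_tensor_argument to add_3D_tensor_argument when _num_groups > 1.

// Host-side sketch of the kernels' grouped index math (illustrative values).
#include <cstdio>

int main()
{
    const unsigned int src_depth  = 8; // SRC_DEPTH: number of input channels
    const unsigned int num_groups = 4; // NUM_GROUPS: must divide SRC_DEPTH evenly
    const unsigned int kernel_w   = 3; // KERNEL_WIDTH
    const unsigned int kernel_h   = 3; // KERNEL_HEIGHT

    const unsigned int channels_per_group = src_depth / num_groups;

    for(unsigned int ch = 0; ch < src_depth; ++ch)
    {
        // Same arithmetic as the NUM_GROUPS path in im2col_generic_nchw:
        // xo is the patch offset of the channel within its own group,
        // zo is the group the channel belongs to.
        const unsigned int xo = (ch % channels_per_group) * kernel_w * kernel_h;
        const unsigned int zo = ch / channels_per_group;
        std::printf("ch=%u -> xo=%u zo=%u\n", ch, xo, zo);
    }
    return 0;
}

With these values (8 channels, 4 groups), xo cycles through 0 and 9 while zo steps through the 4 groups, matching the per-group restart of the X offset that the kernels implement; the bias row is likewise written once per group, when xo reaches the last channel of its group.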