From 916d1bcee42051721a82cfb46b52855c2fe56646 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 13 Aug 2018 11:20:41 +0100 Subject: COMPMID-1498 - Enable grouping in CLGEMMConvolutionLayer Change-Id: I15c7df21773145b03f42b6f78bd7ad2e5b8a5219 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/144126 Tested-by: Jenkins Reviewed-by: Giorgio Arena Reviewed-by: Georgios Pinitas --- arm_compute/core/CL/kernels/CLCol2ImKernel.h | 5 +- arm_compute/core/CL/kernels/CLIm2ColKernel.h | 4 +- .../core/CL/kernels/CLWeightsReshapeKernel.h | 8 +- arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 4 +- arm_compute/core/utils/misc/ShapeCalculator.h | 9 +- .../runtime/CL/functions/CLConvolutionLayer.h | 9 +- .../runtime/CL/functions/CLGEMMConvolutionLayer.h | 28 ++--- .../GLES_COMPUTE/functions/GCConvolutionLayer.h | 23 +--- .../runtime/NEON/functions/NEConvolutionLayer.h | 9 +- .../NEON/functions/NEGEMMConvolutionLayer.h | 6 +- src/core/CL/kernels/CLCol2ImKernel.cpp | 3 +- src/core/CL/kernels/CLIm2ColKernel.cpp | 2 + src/core/CL/kernels/CLWeightsReshapeKernel.cpp | 25 +++-- src/runtime/CL/functions/CLConvolutionLayer.cpp | 15 ++- .../CL/functions/CLGEMMConvolutionLayer.cpp | 43 ++++---- .../GLES_COMPUTE/functions/GCConvolutionLayer.cpp | 4 +- src/runtime/NEON/functions/NEConvolutionLayer.cpp | 7 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 10 +- tests/datasets/LargeConvolutionLayerDataset.h | 38 +++++-- tests/datasets/SmallConvolutionLayerDataset.h | 35 ++++++ tests/validation/CL/ConvolutionLayer.cpp | 117 +++++++++++++++++++-- tests/validation/CL/WeightsReshape.cpp | 4 +- .../validation/fixtures/ConvolutionLayerFixture.h | 17 ++- tests/validation/reference/ConvolutionLayer.cpp | 49 +++++---- tests/validation/reference/ConvolutionLayer.h | 2 +- 25 files changed, 331 insertions(+), 145 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h index 5c047ca091..2a18ae08c8 100644 --- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h +++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h @@ -63,12 +63,11 @@ public: CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default; /** Default destructor */ ~CLCol2ImKernel() = default; - /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. Data types supported: QASYMM8/F16/F32 * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], - * while the rest represent batch of outputs. Data types supported: Same as @p input + * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW * @param[in] convolved_dims Output convolved dimensions. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution */ @@ -77,7 +76,7 @@ public: * * @param[in] input The input tensor to convert. Data types supported: QASYMM8/F16/F32 * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], - * while the rest represent batch of outputs. Data types supported: Same as @p input + * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW * @param[in] convolved_dims Output convolved dimensions. 
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution * diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h index c678f277cb..0647f5dcec 100644 --- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h +++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h @@ -76,7 +76,7 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout */ void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); @@ -90,7 +90,7 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * * @return a status */ diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h index d3bbbaf470..bdc5792641 100644 --- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h +++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h @@ -75,10 +75,10 @@ public: * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. * @param[out] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise. * Data types supported: Same as @p input - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it. */ - void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, const unsigned int num_groups = 1); + void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, @@ -88,12 +88,12 @@ public: * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. * @param[in] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise. * Data types supported: Same as @p input - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it. * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, const unsigned int num_groups = 1); + static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h index 37145a38c1..38bdff0e98 100644 --- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h @@ -84,7 +84,7 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported * @param[in] is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments * @param[in] is_flatten (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments */ @@ -100,7 +100,7 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported * @param[in] is_fully_connected (Optional)Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments * @param[in] is_flatten (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments * diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 8a00c22306..c40e7119b2 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -55,19 +55,20 @@ inline TensorShape compute_permutation_output_shape(const ITensorInfo &input, co permute(output_shape, perm); return output_shape; } -inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, const unsigned int num_groups = 1) +inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1) { // Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it. 
- ARM_COMPUTE_ERROR_ON(num_groups == 0); - ARM_COMPUTE_ERROR_ON((weights.dimension(3) % num_groups) != 0); ARM_COMPUTE_ERROR_ON(weights.data_layout() == DataLayout::NHWC && num_groups > 1); + ARM_COMPUTE_ERROR_ON((weights.dimension(3) % num_groups) != 0); // Calculate output shape TensorShape weights_reshaped{ weights.tensor_shape() }; + weights_reshaped.set(3, weights_reshaped[3] / num_groups); + weights_reshaped.collapse(3); const size_t tmp_dim = weights_reshaped[0]; - weights_reshaped.set(0, weights_reshaped[1] / num_groups); + weights_reshaped.set(0, weights_reshaped[1]); weights_reshaped.set(1, tmp_dim + (has_bias ? 1 : 0)); if(weights.num_dimensions() < 5) { diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index c9a74f2a4c..8270e9723e 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -60,10 +60,11 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. Default is false + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -79,11 +80,13 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer * * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h index 8538d83c2b..c6ae535713 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h @@ -59,22 +59,24 @@ public: CLConvolutionLayerReshapeWeights(); /** Set the input and output tensors. * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[out] output Destination tensor. Data types supported: Same as @p weights. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/F16/F32. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[out] output Destination tensor. Data types supported: Same as @p weights. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout */ - void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output); + void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayerReshapeWeights * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[in] output Destination tensor. Data types supported: Same as @p weights. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/F16/F32. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[in] output Destination tensor. Data types supported: Same as @p weights. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * * @return a status */ - static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output); + static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1); // Inherited methods overridden: void run() override; @@ -122,9 +124,10 @@ public: * tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer. * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -140,11 +143,12 @@ public: * tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h index 421150e18e..4fac95e72f 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h @@ -101,29 +101,10 @@ public: * tensor has also been transposed with GCGEMMTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref GCConvolutionLayer. - * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. 
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with GCWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with GCGEMMTranspose1xWKernel. Data type supported: Same as @p input. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index c4226cbc5d..5b53aec9e6 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -64,10 +64,11 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. Default is false + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -85,11 +86,13 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index e587cb4e6f..dc00b17efc 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -116,9 +116,10 @@ public: * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -134,11 +135,12 @@ public: * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); // Inherited methods overridden: void run() override; diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp index d7582dc943..40032f97c4 100644 --- a/src/core/CL/kernels/CLCol2ImKernel.cpp +++ b/src/core/CL/kernels/CLCol2ImKernel.cpp @@ -119,7 +119,8 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p _config_id = "col2im_"; _config_id += lower_string(string_from_data_type(input->info()->data_type())); _config_id += "_"; - _config_id += (num_groups > 1) ? "grouping_" : ""; + _config_id += support::cpp11::to_string(num_groups); + _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(1)); diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index 3d71567eee..0ba0d0e2f9 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -324,6 +324,8 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const _config_id += "_"; _config_id += lower_string(string_from_data_type(input->info()->data_type())); _config_id += "_"; + _config_id += support::cpp11::to_string(num_groups); + _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(1)); diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp index 5ef0f5b152..7639a48a5f 100644 --- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp +++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp @@ -38,7 +38,7 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, const unsigned int num_groups) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); @@ -75,12 +75,12 @@ CLWeightsReshapeKernel::CLWeightsReshapeKernel() { } -void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, const unsigned int num_groups) +void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr)))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr), num_groups))); // Perform validation step 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), @@ -102,15 +102,6 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor * // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts.options())); - // Set static arguments - unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor(); - idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0; - _kernel.setArg(idx++, _input->info()->dimension(0)); - _kernel.setArg(idx++, _input->info()->dimension(1)); - _kernel.setArg(idx++, _input->info()->dimension(2)); - _kernel.setArg(idx++, _input->info()->dimension(3)); - _kernel.setArg(idx++, _output->info()->strides_in_bytes().z()); - // Configure window Window win = calculate_max_window(*input->info(), Steps()); // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped @@ -118,7 +109,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor * ICLKernel::configure_internal(win); } -Status CLWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, const unsigned int num_groups) +Status CLWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output, num_groups)); return Status{}; @@ -138,6 +129,14 @@ void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue) Window biases_window; Window biases_slice; + unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor(); + idx += (_biases != nullptr) ? num_arguments_per_1D_tensor() : 0; + _kernel.setArg(idx++, _input->info()->dimension(0)); + _kernel.setArg(idx++, _input->info()->dimension(1)); + _kernel.setArg(idx++, _input->info()->dimension(2)); + _kernel.setArg(idx++, _input->info()->dimension(3)); + _kernel.setArg(idx++, _output->info()->strides_in_bytes().z()); + if(_biases != nullptr) { biases_window.use_tensor_dimensions(_biases->info()->tensor_shape()); diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index d36cce6505..0014e71734 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -43,17 +43,18 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_ma } void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) + const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math)); + enable_fast_math, num_groups)); switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { + ARM_COMPUTE_ERROR_ON(num_groups != 1); auto f = arm_compute::support::cpp14::make_unique(_memory_manager); f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); _function = std::move(f); @@ -61,6 +62,7 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c } case ConvolutionMethod::DIRECT: { + ARM_COMPUTE_ERROR_ON(num_groups != 1); auto f = arm_compute::support::cpp14::make_unique(); f->configure(input, weights, biases, output, conv_info, act_info); _function = std::move(f); @@ -69,7 +71,7 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c case ConvolutionMethod::GEMM: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info); + f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); _function = std::move(f); break; } @@ -80,9 +82,10 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c } Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); @@ -91,19 +94,21 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo case ConvolutionMethod::WINOGRAD: { //Validate Winograd + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLWinogradConvolutionLayer is not supported"); ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); break; } case ConvolutionMethod::DIRECT: { // Validate direct convolution layer + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLDirectConvolutionLayer is not supported"); ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; } case ConvolutionMethod::GEMM: { // Validate gemm-based convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups)); break; } default: diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 1e639d9dff..782fe710e7 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ 
b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -43,23 +43,24 @@ CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights() { } -void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output) +void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayerReshapeWeights::validate(weights->info(), (biases != nullptr) ? biases->info() : nullptr, - output->info())); + output->info(), + num_groups)); const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr; - _weights_reshape_kernel.configure(weights, biases_to_use, output); + _weights_reshape_kernel.configure(weights, biases_to_use, output, num_groups); output->info()->set_quantization_info(weights->info()->quantization_info()); } -Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output) +Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -78,7 +79,7 @@ Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, co { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); - CLWeightsReshapeKernel::validate(weights, biases, output); + CLWeightsReshapeKernel::validate(weights, biases, output, num_groups); } return Status{}; @@ -153,7 +154,7 @@ Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens } void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) + const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -164,7 +165,8 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * conv_info, weights_info, dilation, - act_info)); + act_info, + num_groups)); const DataType data_type = input->info()->data_type(); const DataLayout data_layout = input->info()->data_layout(); @@ -208,11 +210,11 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * conv_info, dilation); - unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels); + unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels) / num_groups; // _weights_reshaped will be auto configured in the kernel. // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM - _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped); + _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, num_groups); // Create tensor to store im2col reshaped inputs if(!_skip_im2col) @@ -220,7 +222,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * _memory_group.manage(&_im2col_output); // Configure and tune im2col. 
im2col output shape is auto-initialized - _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation); + _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation, num_groups); // Set quantization info _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); @@ -283,7 +285,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * if(input->info()->data_layout() == DataLayout::NCHW) { // Configure and tune Col2Im - _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, std::make_pair(conv_w, conv_h)); + _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, std::make_pair(conv_w, conv_h), num_groups); CLScheduler::get().tune_kernel_static(_col2im_kernel); } else @@ -314,13 +316,16 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * } Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(2) / weights->dimension(2)) != num_groups) && (input->data_layout() == DataLayout::NCHW)); const DataLayout data_layout = input->data_layout(); const DataType data_type = input->data_type(); @@ -343,7 +348,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized; const bool append_bias = (biases != nullptr) && (!is_quantized); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases @@ -377,11 +382,11 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI conv_info, dilation); - unsigned int mat_weights_cols = weights->dimension(idx_kernels); + unsigned int mat_weights_cols = weights->dimension(idx_kernels) / num_groups; // Output tensor auto inizialitation if not yet initialized - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? 
nullptr : biases, nullptr)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col)), 1, data_type); + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr, num_groups)); + weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col), num_groups), 1, data_type); weights_to_use = &weights_reshaped_info; if(!skip_im2col) @@ -389,11 +394,11 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI const Size2D kernel_dims(kernel_width, kernel_height); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true /* num_groups == 1, num_groups */); + TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, num_groups == 1, num_groups); auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups)); gemm_input_to_use = &im2col_reshaped_info; } else if(append_bias) @@ -438,7 +443,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI { ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, - std::make_pair(conv_w, conv_h))); + std::make_pair(conv_w, conv_h), num_groups)); } } diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp index 5cfd72f724..c58d184afa 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp +++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp @@ -88,7 +88,7 @@ Status GCConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorIn } void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) + const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); @@ -96,6 +96,8 @@ void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weig ARM_COMPUTE_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2)); ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(num_groups > 1); + ARM_COMPUTE_UNUSED(num_groups); _is_prepared = false; _original_weights = weights; diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index d71fd5a715..931e5db65b 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -42,10 +42,11 @@ NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_ma } void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const 
ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) + const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_UNUSED(num_groups); ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)); @@ -79,8 +80,10 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const } Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON"); + switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info)) { case ConvolutionMethod::WINOGRAD: diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index 52b461e255..b76cf6aa10 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -167,10 +167,10 @@ Status NEGEMMConvolutionLayer::validate_gemm3d(DataType data_type, int gemm_3d_d } void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) + const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - + ARM_COMPUTE_UNUSED(num_groups); ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, @@ -178,7 +178,8 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig conv_info, weights_info, dilation, - act_info)); + act_info, + num_groups)); const DataType data_type = input->info()->data_type(); const DataLayout data_layout = input->info()->data_layout(); @@ -346,13 +347,14 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig } Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON"); const DataLayout data_layout = input->data_layout(); const DataType data_type = input->data_type(); diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h index 3eb98dbeea..170d562f6c 100644 --- a/tests/datasets/LargeConvolutionLayerDataset.h +++ b/tests/datasets/LargeConvolutionLayerDataset.h @@ -166,31 +166,51 @@ public: // Batch size 1 add_config(TensorShape(227U, 227U, 3U), TensorShape(11U, 11U, 3U, 96U), TensorShape(96U), TensorShape(55U, 55U, 96U), PadStrideInfo(4, 4, 0, 0)); add_config(TensorShape(27U, 27U, 96U), TensorShape(5U, 5U, 96U, 256U), TensorShape(256U), TensorShape(27U, 27U, 256U), PadStrideInfo(1, 1, 2, 2)); - add_config(TensorShape(13U, 13U, 256U), TensorShape(3U, 3U, 256U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(13U, 13U, 384U), TensorShape(3U, 3U, 384U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(13U, 13U, 384U), TensorShape(3U, 3U, 384U, 256U), TensorShape(256U), TensorShape(13U, 13U, 256U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(13U, 13U, 256U), TensorShape(1U, 1U, 256U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(13U, 13U, 384U), TensorShape(1U, 1U, 384U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(224U, 224U, 3U), TensorShape(7U, 7U, 3U, 64U), TensorShape(64U), TensorShape(112U, 112U, 64U), PadStrideInfo(2, 2, 3, 3)); add_config(TensorShape(28U, 28U, 256U), TensorShape(1U, 1U, 256U, 64U), TensorShape(64U), TensorShape(28U, 28U, 64U), PadStrideInfo(1, 1, 0, 0)); // Batch size 4 add_config(TensorShape(227U, 227U, 3U, 4U), TensorShape(11U, 11U, 3U, 96U), TensorShape(96U), TensorShape(55U, 55U, 96U, 4U), PadStrideInfo(4, 4, 0, 0)); add_config(TensorShape(27U, 27U, 96U, 4U), TensorShape(5U, 5U, 96U, 256U), TensorShape(256U), TensorShape(27U, 27U, 256U, 4U), PadStrideInfo(1, 1, 2, 2)); - add_config(TensorShape(13U, 13U, 256U, 4U), TensorShape(3U, 3U, 256U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 4U), 
PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(13U, 13U, 384U, 4U), TensorShape(3U, 3U, 384U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 4U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(13U, 13U, 384U, 4U), TensorShape(3U, 3U, 384U, 256U), TensorShape(256U), TensorShape(13U, 13U, 256U, 4U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(13U, 13U, 256U, 4U), TensorShape(1U, 1U, 256U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 4U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(13U, 13U, 384U, 4U), TensorShape(1U, 1U, 384U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 4U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(224U, 224U, 3U, 4U), TensorShape(7U, 7U, 3U, 64U), TensorShape(64U), TensorShape(112U, 112U, 64U, 4U), PadStrideInfo(2, 2, 3, 3)); add_config(TensorShape(28U, 28U, 256U, 4U), TensorShape(1U, 1U, 256U, 64U), TensorShape(64U), TensorShape(28U, 28U, 64U, 4U), PadStrideInfo(1, 1, 0, 0)); // Batch size 8 add_config(TensorShape(227U, 227U, 3U, 8U), TensorShape(11U, 11U, 3U, 96U), TensorShape(96U), TensorShape(55U, 55U, 96U, 8U), PadStrideInfo(4, 4, 0, 0)); add_config(TensorShape(27U, 27U, 96U, 8U), TensorShape(5U, 5U, 96U, 256U), TensorShape(256U), TensorShape(27U, 27U, 256U, 8U), PadStrideInfo(1, 1, 2, 2)); - add_config(TensorShape(13U, 13U, 256U, 8U), TensorShape(3U, 3U, 256U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 8U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(13U, 13U, 384U, 8U), TensorShape(3U, 3U, 384U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 8U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(13U, 13U, 384U, 8U), TensorShape(3U, 3U, 384U, 256U), TensorShape(256U), TensorShape(13U, 13U, 256U, 8U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(13U, 13U, 256U, 8U), TensorShape(1U, 1U, 256U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 8U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(13U, 13U, 384U, 8U), TensorShape(1U, 1U, 384U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 8U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(224U, 224U, 3U, 8U), TensorShape(7U, 7U, 3U, 64U), TensorShape(64U), TensorShape(112U, 112U, 64U, 8U), PadStrideInfo(2, 2, 3, 3)); add_config(TensorShape(28U, 28U, 256U, 8U), TensorShape(1U, 1U, 256U, 64U), TensorShape(64U), TensorShape(28U, 28U, 64U, 8U), PadStrideInfo(1, 1, 0, 0)); // Arbitrary batch size add_config(TensorShape(227U, 227U, 3U, 5U), TensorShape(11U, 11U, 3U, 96U), TensorShape(96U), TensorShape(55U, 55U, 96U, 5U), PadStrideInfo(4, 4, 0, 0)); } }; + +class LargeGroupedConvolutionLayerDataset final : public ConvolutionLayerDataset +{ +public: + LargeGroupedConvolutionLayerDataset() + { + // Batch size 1 + add_config(TensorShape(227U, 227U, 4U), TensorShape(11U, 11U, 2U, 96U), TensorShape(96U), TensorShape(55U, 55U, 96U), PadStrideInfo(4, 4, 0, 0)); + add_config(TensorShape(27U, 27U, 96U), TensorShape(5U, 5U, 24U, 256U), TensorShape(256U), TensorShape(27U, 27U, 256U), PadStrideInfo(1, 1, 2, 2)); + add_config(TensorShape(13U, 13U, 256U), TensorShape(1U, 1U, 128U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(13U, 13U, 384U), TensorShape(3U, 3U, 128U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U), PadStrideInfo(1, 1, 1, 1)); + // Batch size 4 + add_config(TensorShape(227U, 227U, 4U, 4U), TensorShape(11U, 11U, 2U, 96U), TensorShape(96U), TensorShape(55U, 55U, 96U, 4U), PadStrideInfo(4, 4, 0, 0)); + add_config(TensorShape(27U, 27U, 96U, 
4U), TensorShape(5U, 5U, 24U, 256U), TensorShape(256U), TensorShape(27U, 27U, 256U, 4U), PadStrideInfo(1, 1, 2, 2)); + add_config(TensorShape(13U, 13U, 256U, 4U), TensorShape(3U, 3U, 128U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 4U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(13U, 13U, 384U, 4U), TensorShape(3U, 3U, 128U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 4U), PadStrideInfo(1, 1, 1, 1)); + // Batch size 8 + add_config(TensorShape(227U, 227U, 4U, 8U), TensorShape(11U, 11U, 2U, 96U), TensorShape(96U), TensorShape(55U, 55U, 96U, 8U), PadStrideInfo(4, 4, 0, 0)); + add_config(TensorShape(27U, 27U, 96U, 8U), TensorShape(5U, 5U, 24U, 256U), TensorShape(256U), TensorShape(27U, 27U, 256U, 8U), PadStrideInfo(1, 1, 2, 2)); + add_config(TensorShape(13U, 13U, 256U, 8U), TensorShape(3U, 3U, 128U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 8U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(13U, 13U, 384U, 8U), TensorShape(3U, 3U, 128U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U, 8U), PadStrideInfo(1, 1, 1, 1)); + } +}; } // namespace datasets } // namespace test } // namespace arm_compute diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h index ae12dd4b16..a288d07902 100644 --- a/tests/datasets/SmallConvolutionLayerDataset.h +++ b/tests/datasets/SmallConvolutionLayerDataset.h @@ -146,6 +146,41 @@ public: add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR)); } }; + +class SmallGroupedConvolutionLayerDataset final : public ConvolutionLayerDataset +{ +public: + SmallGroupedConvolutionLayerDataset() + { + // Batch size 1 + // Number of groups = 2 + add_config(TensorShape(23U, 27U, 8U), TensorShape(1U, 1U, 4U, 24U), TensorShape(24U), TensorShape(12U, 27U, 24U), PadStrideInfo(2, 1, 0, 0)); + add_config(TensorShape(33U, 27U, 12U), TensorShape(5U, 5U, 6U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U), PadStrideInfo(3, 2, 1, 0)); + // Number of groups = 4 + add_config(TensorShape(23U, 27U, 8U), TensorShape(1U, 1U, 2U, 24U), TensorShape(24U), TensorShape(12U, 27U, 24U), PadStrideInfo(2, 1, 0, 0)); + add_config(TensorShape(33U, 27U, 12U), TensorShape(5U, 5U, 4U, 15U), TensorShape(15U), TensorShape(11U, 12U, 15U), PadStrideInfo(3, 2, 1, 0)); + + // Batch size 4 + // Number of groups = 2 + add_config(TensorShape(23U, 27U, 8U, 4U), TensorShape(1U, 1U, 4U, 24U), TensorShape(24U), TensorShape(12U, 27U, 24U, 4U), PadStrideInfo(2, 1, 0, 0)); + add_config(TensorShape(33U, 27U, 12U, 4U), TensorShape(5U, 5U, 6U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 4U), PadStrideInfo(3, 2, 1, 0)); + // Number of groups = 4 + add_config(TensorShape(23U, 27U, 8U, 4U), TensorShape(1U, 1U, 2U, 24U), TensorShape(24U), TensorShape(12U, 27U, 24U, 4U), PadStrideInfo(2, 1, 0, 0)); + add_config(TensorShape(33U, 27U, 12U, 4U), TensorShape(5U, 5U, 4U, 15U), TensorShape(15U), TensorShape(11U, 12U, 15U, 4U), PadStrideInfo(3, 2, 1, 0)); + + // Arbitrary batch size + add_config(TensorShape(23U, 27U, 8U, 5U), TensorShape(1U, 1U, 4U, 24U), TensorShape(24U), TensorShape(12U, 27U, 24U, 5U), PadStrideInfo(2, 1, 0, 0)); + add_config(TensorShape(33U, 27U, 12U, 3U), TensorShape(5U, 5U, 6U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 3U), PadStrideInfo(3, 2, 1, 0)); + // Number of groups = 4 + add_config(TensorShape(23U, 27U, 8U, 2U), TensorShape(1U, 1U, 2U, 24U), TensorShape(24U), 
+        add_config(TensorShape(23U, 27U, 8U, 5U), TensorShape(1U, 1U, 4U, 24U), TensorShape(24U), TensorShape(12U, 27U, 24U, 5U), PadStrideInfo(2, 1, 0, 0));
+        add_config(TensorShape(33U, 27U, 12U, 3U), TensorShape(5U, 5U, 6U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 3U), PadStrideInfo(3, 2, 1, 0));
+        // Number of groups = 4
+        add_config(TensorShape(23U, 27U, 8U, 2U), TensorShape(1U, 1U, 2U, 24U), TensorShape(24U), TensorShape(12U, 27U, 24U, 2U), PadStrideInfo(2, 1, 0, 0));
+        add_config(TensorShape(33U, 27U, 12U, 5U), TensorShape(5U, 5U, 4U, 15U), TensorShape(15U), TensorShape(11U, 12U, 15U, 5U), PadStrideInfo(3, 2, 1, 0));
+
+        // Asymmetric padding
+        add_config(TensorShape(33U, 27U, 8U, 5U), TensorShape(5U, 7U, 2U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 8U, 5U), TensorShape(5U, 7U, 4U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 6U, 5U), TensorShape(5U, 7U, 3U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
+    }
+};
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index 54fdc0c386..5c96cd4c59 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -58,6 +58,14 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
     DataType::QASYMM8,
 });
+
+/** Grouped CNN data types */
+const auto GroupedCNNDataTypes = framework::dataset::make("DataType",
+{
+    DataType::F16,
+    DataType::F32
+});
+
 const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
@@ -219,7 +227,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerFixture<half>, framework:
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
 
@@ -244,8 +252,8 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerFixture<float>, framework
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, absolute_tolerance_float);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
 
 template <typename T>
 using CLGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
@@ -280,11 +288,106 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // GEMMConvolutionLayer
+
+template <typename T>
+using CLGEMMGroupedConvolutionLayerFixture = ConvolutionValidationFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
+
+TEST_SUITE(GroupedGEMMConvolutionLayer)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallGroupedConvolutionLayerDataset(), datasets::LargeGroupedConvolutionLayerDataset()),
+                                                                           GroupedCNNDataTypes),
+                                                                   ActivationFunctionsDataset),
+               input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type, act_info)
+{
+    ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0);
+
+    // The number of groups is calculated by dividing the number of input channels of the input tensor by the number of input channels of the weights shape
+    const int num_groups = input_shape[2] / weights_shape[2];
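+    // For example, an input of shape (23, 27, 8) and weights of shape (1, 1, 4, 24)
+    // from SmallGroupedConvolutionLayerDataset give num_groups = 8 / 4 = 2.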
+
+    // Create tensors
+    CLTensor src     = create_tensor<CLTensor>(input_shape, data_type);
+    CLTensor weights = create_tensor<CLTensor>(weights_shape, data_type, 1);
+    CLTensor bias    = create_tensor<CLTensor>(bias_shape, data_type, 1);
+    CLTensor dst     = create_tensor<CLTensor>(output_shape, data_type, 1);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLGEMMConvolutionLayer conv;
+    conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation, act_info, num_groups);
+
+    // Validate valid region
+    const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
+    const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape);
+    const ValidRegion bias_valid_region    = shape_to_valid_region(bias_shape);
+    const ValidRegion dst_valid_region     = shape_to_valid_region(output_shape);
+
+    validate(src.info()->valid_region(), src_valid_region);
+    validate(weights.info()->valid_region(), weights_valid_region);
+    validate(bias.info()->valid_region(), bias_valid_region);
+    validate(dst.info()->valid_region(), dst_valid_region);
+
+    // Validate padding
+    //TODO(COMPMID-415) Need to validate padding?
+}
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(),
+                                                                                                                                                 framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                                         framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                         ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMGroupedConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeGroupedConvolutionLayerDataset(),
+                                                                                                                                               framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                               framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                       ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, tolerance_num);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(),
+                                                                                                                                                framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                        ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMGroupedConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeGroupedConvolutionLayerDataset(),
+                                                                                                                                              framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                      ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // GroupedGEMMConvolutionLayer
+TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CL/WeightsReshape.cpp b/tests/validation/CL/WeightsReshape.cpp
index 6dae0c7625..30c231d499 100644
--- a/tests/validation/CL/WeightsReshape.cpp
+++ b/tests/validation/CL/WeightsReshape.cpp
@@ -79,7 +79,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWeightsReshapeFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedWeightsSmallShapes(), framework::dataset::make("DataType", DataType::F32)),
                                                                                                       framework::dataset::make("HasBias", { true, false })),
-                                                                                                      framework::dataset::make("NumGroups", { 1, 2, 3 })))
+                                                                                                      framework::dataset::make("NumGroups", { 1, 2, 3, 4 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -87,7 +87,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWeightsReshapeFixture<float>, framework::Data
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWeightsReshapeFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedWeightsLargeShapes(), framework::dataset::make("DataType", DataType::F32)),
                                                                                                           framework::dataset::make("HasBias", { true, false })),
-                                                                                                          framework::dataset::make("NumGroups", { 1, 2, 3 })))
+                                                                                                          framework::dataset::make("NumGroups", { 1, 2, 3, 4 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index 4a6326480c..3b420eac09 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -102,6 +102,10 @@ protected:
     TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info,
                               bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info)
     {
+        ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0);
+
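+        // Shapes are still in NCHW order at this point, so index 2 is the channel
+        // dimension; the NHWC permutation below happens after num_groups is derived.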
+        const unsigned int num_groups = input_shape[2] / weights_shape[2];
+
         if(_data_layout == DataLayout::NHWC)
         {
             permute(input_shape, PermutationVector(2U, 0U, 1U));
@@ -123,7 +127,7 @@
 
         // Create and configure function
         FunctionType conv;
-        conv.configure(&src, &weights, &bias, &dst, info, weights_info, dilation, act_info);
+        conv.configure(&src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -155,6 +159,10 @@ protected:
     SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
                                       const Size2D &dilation, const ActivationLayerInfo act_info)
     {
+        ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0);
+
+        const unsigned int num_groups = input_shape[2] / weights_shape[2];
+
         // Create reference
         SimpleTensor<T> src{ input_shape, _data_type, 1, _quantization_info };
         SimpleTensor<T> weights{ weights_shape, _data_type, 1, _quantization_info };
@@ -165,9 +173,9 @@ protected:
         fill(weights, 1);
         fill(bias, 2);
 
-        return (act_info.enabled()) ? reference::activation_layer<T>(reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation),
+        return (act_info.enabled()) ? reference::activation_layer<T>(reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation, num_groups),
                                                                      act_info) :
-                                      reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation);
+                                      reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation, num_groups);
     }
 
     TensorType _target{};
@@ -187,7 +195,8 @@ public:
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
                DataLayout data_layout, ActivationLayerInfo act_info)
     {
-        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, data_type, data_layout,
+        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
+                                                                                              data_type, data_layout,
                                                                                               QuantizationInfo(), act_info);
     }
 };
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 2d314059dd..f41a6fc8c4 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -47,8 +47,10 @@ namespace
 template <typename T, typename TB>
 SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
-                                       const Size2D &dilation)
+                                       const Size2D &dilation, unsigned int num_groups)
 {
+    ARM_COMPUTE_ERROR_ON((src.shape()[2] / num_groups) != weights.shape()[2]);
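+    // Grouped convolution: the input channels are split into num_groups contiguous
+    // groups of depth_in / num_groups channels, and each group is convolved with its
+    // own set of depth_out / num_groups filters.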
+
     // Compute reference
     const int width_in  = src.shape().x();
     const int height_in = src.shape().y();
@@ -78,23 +80,28 @@ SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleT
         {
             for(int xi = start_xi; xi < start_xi + end_xi; xi += stride_xi)
             {
-                for(int ofm = 0; ofm < depth_out; ++ofm)
+                for(int group = 0; group < static_cast<int>(num_groups); ++group)
                 {
-                    // Compute input and output offsets
-                    const int offset_in  = r * width_in * height_in * depth_in;
-                    const int xo         = (xi - start_xi) / stride_xi;
-                    const int yo         = (yi - start_yi) / stride_yi;
-                    const int offset_out = xo + yo * width_out + ofm * width_out * height_out + r * width_out * height_out * depth_out;
+                    for(int ofm = 0; ofm < static_cast<int>(depth_out / num_groups); ++ofm)
+                    {
+                        // Compute input and output offsets
+                        const int offset_in  = r * width_in * height_in * depth_in + (group * (depth_in / num_groups) * width_in * height_in);
+                        const int xo         = (xi - start_xi) / stride_xi;
+                        const int yo         = (yi - start_yi) / stride_yi;
+                        const int offset_out = xo + yo * width_out + ((ofm + group * (depth_out / num_groups)) * width_out * height_out) + (r * width_out * height_out * depth_out);
+                        const int offset_w   = (ofm + group * (depth_out / num_groups)) * width_weights * height_weights * depth_weights;
+                        const int offset_b   = (ofm + group * (depth_out / num_groups));
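+                        // e.g. with depth_out = 384 and num_groups = 2, (group, ofm) = (1, 0)
+                        // maps to absolute output channel 192 and selects the 193rd kernel.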
 
-                    ARM_COMPUTE_ASSERT(xo < width_out);
-                    ARM_COMPUTE_ASSERT(yo < height_out);
+                        ARM_COMPUTE_ASSERT(xo < width_out);
+                        ARM_COMPUTE_ASSERT(yo < height_out);
 
-                    // Compute 3D convolution
-                    convolution_3d::detail::convolution3d(src, weights, bias, dst,
-                                                          offset_in, ofm * width_weights * height_weights * depth_weights, ofm, offset_out,
-                                                          xi, yi,
-                                                          width_in, height_in, depth_in,
-                                                          width_weights, height_weights, dilation.x(), dilation.y());
+                        // Compute 3D convolution
+                        convolution_3d::detail::convolution3d(src, weights, bias, dst,
+                                                              offset_in, offset_w, offset_b, offset_out,
+                                                              xi, yi,
+                                                              width_in, height_in, (depth_in / num_groups),
+                                                              width_weights, height_weights, dilation.x(), dilation.y());
+                    }
                 }
             }
         }
@@ -104,7 +111,7 @@ SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleT
 }
 template <typename T, typename TB>
 SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
-                                  const Size2D &dilation)
+                                  const Size2D &dilation, unsigned int num_groups)
 {
     // Create reference
     SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.quantization_info() };
@@ -115,20 +122,20 @@ SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor
         SimpleTensor<T> weights_nchw = reference::permute(weights, PermutationVector(1U, 2U, 0U));
         SimpleTensor<T> dst_nchw     = reference::permute(dst, PermutationVector(1U, 2U, 0U));
 
-        return reference::permute(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation), PermutationVector(2U, 0U, 1U));
+        return reference::permute(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation, num_groups), PermutationVector(2U, 0U, 1U));
     }
     else
     {
-        return convolution_layer_nchw(src, weights, bias, dst, info, dilation);
+        return convolution_layer_nchw(src, weights, bias, dst, info, dilation, num_groups);
     }
 }
 
 template SimpleTensor<float> convolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
-                                               const PadStrideInfo &info, const Size2D &dilation);
+                                               const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups);
 template SimpleTensor<half> convolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
-                                              const PadStrideInfo &info, const Size2D &dilation);
+                                              const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups);
 template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
-                                                 const PadStrideInfo &info, const Size2D &dilation);
+                                                 const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ConvolutionLayer.h b/tests/validation/reference/ConvolutionLayer.h
index ff3b1531f4..ccce53a209 100644
--- a/tests/validation/reference/ConvolutionLayer.h
+++ b/tests/validation/reference/ConvolutionLayer.h
@@ -37,7 +37,7 @@ namespace reference
 {
 template <typename T, typename TB>
 SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
-                                  const Size2D &dilation = Size2D(1U, 1U));
+                                  const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
 } // namespace reference
 } // namespace validation
 } // namespace test
--
cgit v1.2.1