From 4a578b923ed000c67fe0bc1433f945aea634ca9c Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Fri, 25 Jun 2021 12:13:49 +0100
Subject: Port the ClGemmLowp kernels to the new API

Ported kernels:
 - CLGEMMLowpMatrixMultiplyNativeKernel
 - CLGEMMLowpMatrixMultiplyReshapedKernel
 - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
 - CLGEMMLowpOffsetContributionKernel
 - CLGEMMLowpOffsetContributionOutputStageKernel
 - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
 - CLGEMMLowpQuantizeDownInt32ScaleKernel

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I9d5a744d6a2dd2f2726fdfb291bad000b6970de2
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5870
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 Android.bp                                         |  18 +-
 arm_compute/core/experimental/Types.h              |  39 +-
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h    |  31 +-
 .../runtime/CL/functions/CLGEMMLowpOutputStage.h   | 260 +---------
 .../runtime/CL/functions/CLLSTMLayerQuantized.h    |  68 +--
 arm_compute/runtime/CL/functions/CLQLSTMLayer.h    | 140 ++---
 docs/user_guide/release_version_and_change_log.dox |  44 +-
 examples/gemm_tuner/cl_gemmlowp_reshaped.cpp       |  19 +-
 ...aped_rhs_only_fused_output_stage_fixedpoint.cpp |  30 +-
 filelist.json                                      |  78 +--
 src/core/CL/CLKernels.h                            |   9 -
 .../CLGEMMLowpMatrixMultiplyNativeKernel.cpp       | 335 ------------
 .../kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h | 108 ----
 .../CLGEMMLowpMatrixMultiplyReshapedKernel.cpp     | 298 -----------
 .../CLGEMMLowpMatrixMultiplyReshapedKernel.h       | 125 -----
 ...GEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp | 574 ---------------------
 ...CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h | 155 ------
 .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 215 --------
 .../kernels/CLGEMMLowpOffsetContributionKernel.h   | 116 -----
 ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 274 ----------
 ...CLGEMMLowpOffsetContributionOutputStageKernel.h | 136 -----
 ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 153 ------
 ...MLowpQuantizeDownInt32ScaleByFixedPointKernel.h |  89 ----
 ...GEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 160 ------
 ...CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h | 101 ----
 .../CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp     | 158 ------
 .../CLGEMMLowpQuantizeDownInt32ScaleKernel.h       | 102 ----
 src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp  | 221 --------
 src/core/CL/kernels/CLGEMMLowpReductionKernel.h    | 176 -------
 .../ClGemmLowpMatrixMultiplyNativeKernel.cpp       | 335 ++++++++++++
 .../kernels/ClGemmLowpMatrixMultiplyNativeKernel.h |  81 +++
 .../ClGemmLowpMatrixMultiplyReshapedKernel.cpp     | 300 +++++++++++
 .../ClGemmLowpMatrixMultiplyReshapedKernel.h       |  90 ++++
 ...GemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp | 544 +++++++++++++++++++
 ...ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h | 100 ++++
 .../kernels/ClGemmLowpOffsetContributionKernel.cpp | 212 ++++++++
 .../kernels/ClGemmLowpOffsetContributionKernel.h   |  86 +++
 ...GemmLowpOffsetContributionOutputStageKernel.cpp | 263 ++++++++++
 ...ClGemmLowpOffsetContributionOutputStageKernel.h |  90 ++++
 ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 160 ++++++
 ...mLowpQuantizeDownInt32ScaleByFixedPointKernel.h |  78 +++
 ...GemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 160 ++++++
 ...ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h |  80 +++
 .../ClGemmLowpQuantizeDownInt32ScaleKernel.cpp     | 157 ++++++
 .../ClGemmLowpQuantizeDownInt32ScaleKernel.h       |  80 +++
 .../gpu/cl/kernels/ClGemmLowpReductionKernel.cpp   | 219 ++++++++
 .../gpu/cl/kernels/ClGemmLowpReductionKernel.h     | 124 +++++
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |   5 -
 .../CL/functions/CLGEMMConvolutionLayer.cpp        |   5 -
 .../CL/functions/CLGEMMDeconvolutionLayer.cpp      |   5 -
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  | 157 ++++--
 src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 138 ++---
 src/runtime/CL/functions/CLLSTMLayer.cpp           |   5 -
 src/runtime/CL/functions/CLLSTMLayerQuantized.cpp  |  20 +-
 src/runtime/CL/functions/CLQLSTMLayer.cpp          | 100 ++--
 src/runtime/CL/functions/CLRNNLayer.cpp            |   5 -
 tests/validation/CL/GEMMLowp.cpp                   | 134 -----
 .../validation/CL/GEMMLowpMatrixMultiplyNative.cpp |   6 +-
 .../CL/GEMMLowpMatrixMultiplyReshaped.cpp          |   4 +-
 .../CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp   |   6 +-
 tests/validation/fixtures/GEMMLowpFixture.h        |  30 +-
 61 files changed, 3639 insertions(+), 4372 deletions(-)
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLGEMMLowpReductionKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h

diff --git a/Android.bp b/Android.bp
index 6250539d02..822afc1eca 100644
--- a/Android.bp
+++ b/Android.bp
@@ -96,15 +96,6 @@ cc_library_static {
         "src/core/CL/kernels/CLFFTScaleKernel.cpp",
         "src/core/CL/kernels/CLFillBorderKernel.cpp",
         "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp",
         "src/core/CL/kernels/CLGatherKernel.cpp",
         "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp",
         "src/core/CL/kernels/CLIm2ColKernel.cpp",
@@ -351,6 +342,15 @@ cc_library_static {
         "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp",
         "src/core/gpu/cl/kernels/ClFillKernel.cpp",
         "src/core/gpu/cl/kernels/ClFloorKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp",
+        "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp",
         "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp",
         "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp",
         "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp",
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
index 92ece460dc..1983344075 100644
--- a/arm_compute/core/experimental/Types.h
+++ b/arm_compute/core/experimental/Types.h
@@ -39,15 +39,24 @@ enum TensorType : int32_t
 {
     ACL_UNKNOWN = -1,
     ACL_SRC_DST = 0,
-    ACL_SRC     = 0,
-    ACL_SRC_0   = 0,
-    ACL_SRC_1   = 1,
-    ACL_SRC_2   = 2,
-    ACL_DST     = 30,
-    ACL_DST_0   = 30,
-    ACL_DST_1   = 31,
-    ACL_DST_2   = 32,
-    ACL_BIAS    = ACL_SRC_2,
+
+    // Src
+    ACL_SRC   = 0,
+    ACL_SRC_0 = 0,
+    ACL_SRC_1 = 1,
+    ACL_SRC_2 = 2,
+    ACL_SRC_3 = 3,
+    ACL_SRC_4 = 4,
+    ACL_SRC_5 = 5,
+    ACL_SRC_6 = 6,
+
+    // Dst
+    ACL_DST   = 30,
+    ACL_DST_0 = 30,
+    ACL_DST_1 = 31,
+    ACL_DST_2 = 32,
+
+    // Aux
     ACL_INT     = 50,
     ACL_INT_0   = 50,
     ACL_INT_1   = 51,
@@ -56,7 +65,17 @@ enum TensorType : int32_t
     ACL_INT_4   = 54,
     ACL_SRC_VEC = 256,
     ACL_DST_VEC = 512,
-    ACL_INT_VEC = 1024
+    ACL_INT_VEC = 1024,
+
+    // Aliasing Types
+    // Conv etc
+    ACL_BIAS = ACL_SRC_2,
+
+    // Gemm
+    ACL_VEC_ROW_SUM = ACL_SRC_3,
+    ACL_VEC_COL_SUM = ACL_SRC_4,
+    ACL_SHIFTS      = ACL_SRC_5,
+    ACL_MULTIPLIERS = ACL_SRC_6,
 };
 
 namespace experimental
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 3d2dbdb104..e62db8e644 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -34,17 +34,17 @@ class CLCompileContext;
 class IMemoryManager;
 class ICLTensor;
 class ITensorInfo;
-class CLGEMMLowpMatrixMultiplyNativeKernel;
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel;
-class CLGEMMLowpOffsetContributionKernel;
-class CLGEMMLowpOffsetContributionOutputStageKernel;
-class CLGEMMLowpMatrixAReductionKernel;
-class CLGEMMLowpMatrixBReductionKernel;
 namespace opencl
 {
 namespace kernels
 {
 class ClGemmReshapeRhsMatrixKernel;
+class ClGemmLowpMatrixMultiplyNativeKernel;
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel;
+class ClGemmLowpOffsetContributionKernel;
+class ClGemmLowpOffsetContributionOutputStageKernel;
+class ClGemmLowpMatrixAReductionKernel;
+class ClGemmLowpMatrixBReductionKernel;
 } // namespace kernels
 } // namespace opencl
 
@@ -150,14 +150,14 @@ private:
     MemoryGroup _memory_group;
 
     // Kernels used
-    std::unique_ptr<opencl::kernels::ClCastKernel>                 _weights_to_qasymm8;
-    std::unique_ptr<CLGEMMLowpMatrixMultiplyNativeKernel>          _mm_native_kernel;
-    std::unique_ptr<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel> _mm_reshaped_only_rhs_kernel;
-    std::unique_ptr<opencl::kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
-    std::unique_ptr<CLGEMMLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
-    std::unique_ptr<CLGEMMLowpOffsetContributionKernel>            _offset_contribution_kernel;
-    std::unique_ptr<CLGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+    std::unique_ptr<opencl::kernels::ClCastKernel>                                  _weights_to_qasymm8;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel>          _mm_native_kernel;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
+    std::unique_ptr<opencl::kernels::ClGemmReshapeRhsMatrixKernel>                  _mtx_b_reshape_kernel;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
+    std::unique_ptr<opencl::kernels::ClGemmLowpOffsetContributionKernel>            _offset_contribution_kernel;
+    std::unique_ptr<opencl::kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
 
     // Temporary tensors
     CLTensor _qasymm8_weights;
@@ -171,7 +171,8 @@ private:
     // Tensor pointers
     const ICLTensor *_matrix_a;
     const ICLTensor *_original_b;
-    const ICLTensor *_output;
+    const ICLTensor *_c;
+    ICLTensor       *_output;
 
     int32_t _a_offset;
     int32_t _b_offset;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index a60992a0f4..e85f2db8a9 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -45,247 +45,28 @@ class ICLTensor;
 class ITensorInfo;
 struct GEMMLowpOutputStageInfo;
 
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
- *
- *  CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
- *
- *  result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- *  This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: QASYMM8
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  compile_context              The compile context to be used.
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: QASYMM8
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
-                   int result_offset_after_shift,
-                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
-     *
-     * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: QASYMM8
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on OpenCL.
- *
- *  CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
- *
- *  result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- *  This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: QASYMM8_SIGNED
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8_SIGNED
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  compile_context              The compile context to be used.
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: QASYMM8_SIGNED
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8_SIGNED
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
-                   int result_offset_after_shift,
-                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
-     *
-     * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL.
- *
- *  CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
- *
- *  result_fixedpoint_multiplier, result_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- *  This function calls the following CL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: QSYMM16
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
-                   int max = std::numeric_limits<int32_t>::max());
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  compile_context              The compile context to be used.
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: QSYMM16
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
-                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint
-     *
-     * @param[in] input  Input tensor info. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor info. Data type supported: QSYMM16
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
 /** Basic function to execute GEMMLowpQuantizeDown kernels on CL.
  *
  *  This function calls the following CL kernels:
  *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
 */
-class CLGEMMLowpOutputStage : public ICLSimpleFunction
+class CLGEMMLowpOutputStage : public IFunction
 {
 public:
+    CLGEMMLowpOutputStage();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpOutputStage(const CLGEMMLowpOutputStage &) = delete;
+    /** Default move constructor */
+    CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpOutputStage &operator=(const CLGEMMLowpOutputStage &) = delete;
+    /** Default move assignment operator */
+    CLGEMMLowpOutputStage &operator=(CLGEMMLowpOutputStage &&);
+    /** Default destructor */
+    ~CLGEMMLowpOutputStage();
     /** Initialise the kernel's inputs, output
      *
      * Valid data layouts:
@@ -315,7 +96,7 @@ public:
      * @param[in]  info            GEMMLowp output stage metadata.
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+    /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
      *
      * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
      * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
@@ -326,6 +107,15 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::unique_ptr<ICLKernel> _kernel;
+    const ICLTensor           *_input;
+    const ICLTensor           *_bias;
+    ICLTensor                 *_output;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
index 2ef7427a5a..9c004b85d0 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
@@ -47,16 +47,16 @@ class ICLTensor;
  *
  * This function calls the following CL functions/kernels:
  *
- * -# @ref CLGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
- * -# @ref CLTranspose                                           Matrix transpose
- * -# @ref CLConcatenateLayer                                    Tensor concatenation
- * -# @ref CLActivationLayer                                     Activation functions (tanh and logistic)
- * -# @ref CLArithmeticAddition                                  Elementwise addition
- * -# @ref CLPixelWiseMultiplication                             Elementwise multiplication
- * -# @ref CLSlice                                               Tensor slicing
- * -# @ref CLDequantizationLayer                                 Dequantize into float
- * -# @ref CLQuantizationLayer                                   Quantize from float
+ * -# @ref CLGEMMLowpMatrixMultiplyCore      Quantized matrix multiplication core. Accumulators are 32-bit integers
+ * -# @ref CLGEMMLowpOutputStage             Convert 32-bit integers into QSYMM16
+ * -# @ref CLTranspose                       Matrix transpose
+ * -# @ref CLConcatenateLayer                Tensor concatenation
+ * -# @ref CLActivationLayer                 Activation functions (tanh and logistic)
+ * -# @ref CLArithmeticAddition              Elementwise addition
+ * -# @ref CLPixelWiseMultiplication         Elementwise multiplication
+ * -# @ref CLSlice                           Tensor slicing
+ * -# @ref CLDequantizationLayer             Dequantize into float
+ * -# @ref CLQuantizationLayer               Quantize from float
  * */
 class CLLSTMLayerQuantized : public IFunction
 {
@@ -170,30 +170,30 @@ private:
     MemoryGroup _memory_group;
 
     // Functions used
-    CLGEMMLowpMatrixMultiplyCore                        _gemmlowp;
-    CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
-    CLTranspose                                         _transpose_weights;
-    CLConcatenateLayer                                  _concat_input_weights;
-    CLConcatenateLayer                                  _concat_recurrent_weights;
-    CLConcatenateLayer                                  _concat_weights;
-    CLConcatenateLayer                                  _concat_inputs;
-    CLConcatenateLayer                                  _concat_bias;
-    CLActivationLayer                                   _sigmoid_forget_gate;
-    CLActivationLayer                                   _sigmoid_input_gate;
-    CLActivationLayer                                   _sigmoid_output_gate;
-    CLActivationLayer                                   _tanh_modulation_gate;
-    CLActivationLayer                                   _tanh_output_state;
-    CLArithmeticAddition                                _add_cell_state_tmps;
-    CLArithmeticAddition                                _add2;
-    CLPixelWiseMultiplication                           _mul_forget_gate_cell_state;
-    CLPixelWiseMultiplication                           _mul_input_gate_input_mod_gate;
-    CLPixelWiseMultiplication                           _mul_output_state_tmp_output_gate;
-    CLSlice                                             _slice_input_tensor;
-    CLSlice                                             _slice_forget_tensor;
-    CLSlice                                             _slice_cell_tensor;
-    CLSlice                                             _slice_output_tensor;
-    CLDequantizationLayer                               _dequantize;
-    CLQuantizationLayer                                 _quantize;
+    CLGEMMLowpMatrixMultiplyCore _gemmlowp;
+    CLGEMMLowpOutputStage        _output_stage;
+    CLTranspose                  _transpose_weights;
+    CLConcatenateLayer           _concat_input_weights;
+    CLConcatenateLayer           _concat_recurrent_weights;
+    CLConcatenateLayer           _concat_weights;
+    CLConcatenateLayer           _concat_inputs;
+    CLConcatenateLayer           _concat_bias;
+    CLActivationLayer            _sigmoid_forget_gate;
+    CLActivationLayer            _sigmoid_input_gate;
+    CLActivationLayer            _sigmoid_output_gate;
+    CLActivationLayer            _tanh_modulation_gate;
+    CLActivationLayer            _tanh_output_state;
+    CLArithmeticAddition         _add_cell_state_tmps;
+    CLArithmeticAddition         _add2;
+    CLPixelWiseMultiplication    _mul_forget_gate_cell_state;
+    CLPixelWiseMultiplication    _mul_input_gate_input_mod_gate;
+    CLPixelWiseMultiplication    _mul_output_state_tmp_output_gate;
+    CLSlice                      _slice_input_tensor;
+    CLSlice                      _slice_forget_tensor;
+    CLSlice                      _slice_cell_tensor;
+    CLSlice                      _slice_output_tensor;
+    CLDequantizationLayer        _dequantize;
+    CLQuantizationLayer          _quantize;
 
     // Tensor pointers
     const ICLTensor *_input_to_input_weights;
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index bd00d56468..1b0b759d74 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -40,9 +40,15 @@ namespace arm_compute
 // Forward declarations
 class CLCompileContext;
 class ICLTensor;
-class CLGEMMLowpMatrixAReductionKernel;
 class CLQLSTMLayerNormalizationKernel;
 class ITensorInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace opencl
 
 /** Basic function to run @ref CLQLSTMLayer
  *
@@ -52,8 +58,8 @@ class ITensorInfo;
  * -# @ref CLCopy                                                Copy function for copying output_state_out to output
  * -# @ref CLArithmeticAddition                                  Elementwise addition and subtraction
  * -# @ref CLGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
- * -# @ref CLGEMMLowpMatrixAReductionKernel                      For precomputing effective biases to use
+ * -# @ref CLGEMMLowpOutputStage   Convert 32-bit integers into QSYMM16
+ * -# @ref opencl::kernels::ClGemmLowpMatrixAReductionKernel                      For precomputing effective biases to use
  * -# @ref CLPixelWiseMultiplication                             Elementwise multiplication
  * -# @ref CLTranspose                                           Transpose function for reshaping the weights
  * */
@@ -297,70 +303,70 @@ private:
     };
 
     // Functions used
-    CLTranspose                                       _transpose_input_to_forget_weights{};
-    CLTranspose                                       _transpose_input_to_cell_weights{};
-    CLTranspose                                       _transpose_input_to_output_weights{};
-    CLTranspose                                       _transpose_input_to_input_weights{};
-    CLTranspose                                       _transpose_recurrent_to_forget_weights{};
-    CLTranspose                                       _transpose_recurrent_to_cell_weights{};
-    CLTranspose                                       _transpose_recurrent_to_output_weights{};
-    CLTranspose                                       _transpose_recurrent_to_input_weights{};
-    CLTranspose                                       _transpose_projection_weights{};
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
-    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _projection_reduction;
-    CLArithmeticAddition                              _projection_bias_add{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_input_to_forget{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_forget{};
-    CLPixelWiseMultiplication                         _pixelwise_mul_cell_to_forget{};
-    CLGEMMLowpOutputStage                             _input_to_forget_outstage{};
-    CLGEMMLowpOutputStage                             _recurrent_to_forget_outstage{};
-    CLGEMMLowpOutputStage                             _cell_to_forget_outstage{};
-    CLArithmeticAddition                              _accumulate_input_recurrent_forget{};
-    CLArithmeticAddition                              _accumulate_cell_forget{};
-    CLActivationLayer                                 _forget_gate_sigmoid{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_input_to_cell{};
-    CLGEMMLowpOutputStage                             _input_to_cell_outstage{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_cell{};
-    CLGEMMLowpOutputStage                             _recurrent_to_cell_outstage{};
-    CLArithmeticAddition                              _accumulate_input_recurrent_modulation{};
-    CLActivationLayer                                 _cell_gate_tanh{};
-    CLArithmeticSubtraction                           _input_gate_sub{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_input_to_input{};
-    CLGEMMLowpOutputStage                             _input_to_input_outstage{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_input{};
-    CLGEMMLowpOutputStage                             _recurrent_to_input_outstage{};
-    CLArithmeticAddition                              _accumulate_input_recurrent_input{};
-    CLPixelWiseMultiplication                         _pixelwise_mul_cell_to_input{};
-    CLGEMMLowpOutputStage                             _cell_to_input_outstage{};
-    CLArithmeticAddition                              _accumulate_cell_input{};
-    CLActivationLayer                                 _input_gate_sigmoid{};
-    CLPixelWiseMultiplication                         _pixelwise_mul_forget_cell{};
-    CLPixelWiseMultiplication                         _pixelwise_mul_input_cell{};
-    CLArithmeticAddition                              _add_forget_cell{};
-    CLActivationLayer                                 _cell_clip{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_input_to_output{};
-    CLGEMMLowpOutputStage                             _input_to_output_outstage{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_output{};
-    CLGEMMLowpOutputStage                             _recurrent_to_output_outstage{};
-    CLArithmeticAddition                              _accumulate_input_recurrent_output{};
-    CLPixelWiseMultiplication                         _pixelwise_mul_cell_to_output{};
-    CLGEMMLowpOutputStage                             _cell_to_output_outstage{};
-    CLArithmeticAddition                              _accumulate_cell_to_output{};
-    CLActivationLayer                                 _output_gate_sigmoid{};
-    CLActivationLayer                                 _hidden_tanh{};
-    CLPixelWiseMultiplication                         _pixelwise_mul_hidden{};
-    CLGEMMLowpOutputStage                             _hidden_outstage{};
-    CLGEMMLowpMatrixMultiplyCore                      _mm_projection{};
-    CLGEMMLowpOutputStage                             _projection_outstage{};
-    CLArithmeticAddition                              _accumulate_projection{};
-    CLActivationLayer                                 _projection_clip{};
+    CLTranspose                                                        _transpose_input_to_forget_weights{};
+    CLTranspose                                                        _transpose_input_to_cell_weights{};
+    CLTranspose                                                        _transpose_input_to_output_weights{};
+    CLTranspose                                                        _transpose_input_to_input_weights{};
+    CLTranspose                                                        _transpose_recurrent_to_forget_weights{};
+    CLTranspose                                                        _transpose_recurrent_to_cell_weights{};
+    CLTranspose                                                        _transpose_recurrent_to_output_weights{};
+    CLTranspose                                                        _transpose_recurrent_to_input_weights{};
+    CLTranspose                                                        _transpose_projection_weights{};
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+    std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _projection_reduction;
+    CLArithmeticAddition                                               _projection_bias_add{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_input_to_forget{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_recurrent_to_forget{};
+    CLPixelWiseMultiplication                                          _pixelwise_mul_cell_to_forget{};
+    CLGEMMLowpOutputStage                                              _input_to_forget_outstage{};
+    CLGEMMLowpOutputStage                                              _recurrent_to_forget_outstage{};
+    CLGEMMLowpOutputStage                                              _cell_to_forget_outstage{};
+    CLArithmeticAddition                                               _accumulate_input_recurrent_forget{};
+    CLArithmeticAddition                                               _accumulate_cell_forget{};
+    CLActivationLayer                                                  _forget_gate_sigmoid{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_input_to_cell{};
+    CLGEMMLowpOutputStage                                              _input_to_cell_outstage{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_recurrent_to_cell{};
+    CLGEMMLowpOutputStage                                              _recurrent_to_cell_outstage{};
+    CLArithmeticAddition                                               _accumulate_input_recurrent_modulation{};
+    CLActivationLayer                                                  _cell_gate_tanh{};
+    CLArithmeticSubtraction                                            _input_gate_sub{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_input_to_input{};
+    CLGEMMLowpOutputStage                                              _input_to_input_outstage{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_recurrent_to_input{};
+    CLGEMMLowpOutputStage                                              _recurrent_to_input_outstage{};
+    CLArithmeticAddition                                               _accumulate_input_recurrent_input{};
+    CLPixelWiseMultiplication                                          _pixelwise_mul_cell_to_input{};
+    CLGEMMLowpOutputStage                                              _cell_to_input_outstage{};
+    CLArithmeticAddition                                               _accumulate_cell_input{};
+    CLActivationLayer                                                  _input_gate_sigmoid{};
+    CLPixelWiseMultiplication                                          _pixelwise_mul_forget_cell{};
+    CLPixelWiseMultiplication                                          _pixelwise_mul_input_cell{};
+    CLArithmeticAddition                                               _add_forget_cell{};
+    CLActivationLayer                                                  _cell_clip{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_input_to_output{};
+    CLGEMMLowpOutputStage                                              _input_to_output_outstage{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_recurrent_to_output{};
+    CLGEMMLowpOutputStage                                              _recurrent_to_output_outstage{};
+    CLArithmeticAddition                                               _accumulate_input_recurrent_output{};
+    CLPixelWiseMultiplication                                          _pixelwise_mul_cell_to_output{};
+    CLGEMMLowpOutputStage                                              _cell_to_output_outstage{};
+    CLArithmeticAddition                                               _accumulate_cell_to_output{};
+    CLActivationLayer                                                  _output_gate_sigmoid{};
+    CLActivationLayer                                                  _hidden_tanh{};
+    CLPixelWiseMultiplication                                          _pixelwise_mul_hidden{};
+    CLGEMMLowpOutputStage                                              _hidden_outstage{};
+    CLGEMMLowpMatrixMultiplyCore                                       _mm_projection{};
+    CLGEMMLowpOutputStage                                              _projection_outstage{};
+    CLArithmeticAddition                                               _accumulate_projection{};
+    CLActivationLayer                                                  _projection_clip{};
     std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
     CLCopy _copy_output;
 
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index eb4c280295..0c8b57ff9f 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -227,7 +227,7 @@ v20.11 Public major release
       - @ref CLLogSoftmaxLayer
       - GCSoftmaxLayer
  - New OpenCL kernels / functions:
-   - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+   - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
    - @ref CLLogicalNot
    - @ref CLLogicalAnd
    - @ref CLLogicalOr
@@ -260,13 +260,13 @@ v20.11 Public major release
    - @ref CLBatchNormalizationLayerKernel
    - CLPoolingLayerKernel
    - CLWinogradInputTransformKernel
-   - @ref CLGEMMLowpMatrixMultiplyNativeKernel
-   - @ref CLGEMMLowpMatrixAReductionKernel
-   - @ref CLGEMMLowpMatrixBReductionKernel
-   - @ref CLGEMMLowpOffsetContributionOutputStageKernel
-   - @ref CLGEMMLowpOffsetContributionKernel
+   - CLGEMMLowpMatrixMultiplyNativeKernel
+   - CLGEMMLowpMatrixAReductionKernel
+   - CLGEMMLowpMatrixBReductionKernel
+   - CLGEMMLowpOffsetContributionOutputStageKernel
+   - CLGEMMLowpOffsetContributionKernel
    - CLWinogradOutputTransformKernel
-   - @ref CLGEMMLowpMatrixMultiplyReshapedKernel
+   - CLGEMMLowpMatrixMultiplyReshapedKernel
    - @ref CLFuseBatchNormalizationKernel
    - @ref CLDepthwiseConvolutionLayerNativeKernel
    - CLDepthConvertLayerKernel
@@ -281,11 +281,11 @@ v20.11 Public major release
    - CLLogits1DNormKernel
    - CLHeightConcatenateLayerKernel
    - CLGEMMMatrixMultiplyKernel
-   - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
-   - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
-   - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
+   - CLGEMMLowpQuantizeDownInt32ScaleKernel
+   - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
+   - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
    - CLDepthConcatenateLayerKernel
-   - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+   - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
  - Removed OpenCL kernels / functions:
    - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
    - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
@@ -596,9 +596,9 @@ v20.05 Public major release
      - @ref CLDeconvolutionLayer
      - @ref CLDirectDeconvolutionLayer
      - @ref CLGEMMDeconvolutionLayer
-     - @ref CLGEMMLowpMatrixMultiplyReshapedKernel
-     - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
-     - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
+     - CLGEMMLowpMatrixMultiplyReshapedKernel
+     - CLGEMMLowpQuantizeDownInt32ScaleKernel
+     - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
      - @ref CLReductionOperation
      - @ref CLReduceMean
      - @ref NEScale
@@ -655,9 +655,9 @@ v20.02 Public major release
      - @ref CLDepthwiseConvolutionLayer
      - CLDepthwiseConvolutionLayer3x3
      - @ref CLGEMMConvolutionLayer
-     - @ref CLGEMMLowpMatrixMultiplyCore
-     - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
-     - @ref CLGEMMLowpMatrixMultiplyNativeKernel
+     - CLGEMMLowpMatrixMultiplyCore
+     - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
+     - CLGEMMLowpMatrixMultiplyNativeKernel
      - @ref NEActivationLayer
      - NEComparisonOperationKernel
      - @ref NEConvolutionLayer
@@ -680,7 +680,7 @@ v20.02 Public major release
      - @ref NESplit
  - New OpenCL kernels / functions:
      - @ref CLFill
-     - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
+     - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
  - New Arm® Neon™ kernels / functions:
      - @ref NEFill
      - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
@@ -861,7 +861,7 @@ v19.05 Public major release
     - @ref CLFFTDigitReverseKernel
     - @ref CLFFTRadixStageKernel
     - @ref CLFFTScaleKernel
-    - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
+    - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
     - CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
     - CLHeightConcatenateLayerKernel
     - @ref CLDirectDeconvolutionLayer
@@ -953,7 +953,7 @@ v19.02 Public major release
     - @ref CLRangeKernel / @ref CLRange
     - @ref CLUnstack
     - @ref CLGatherKernel / @ref CLGather
-    - @ref CLGEMMLowpMatrixMultiplyReshapedKernel
+    - CLGEMMLowpMatrixMultiplyReshapedKernel
  - New CPP kernels / functions:
     - @ref CPPDetectionOutputLayer
     - @ref CPPTopKV / @ref CPPTopKVKernel
@@ -1247,8 +1247,8 @@ v17.12 Public major release
     - NEWinogradLayer / NEWinogradLayerKernel
 
  - New OpenCL kernels / functions
-    - @ref CLGEMMLowpOffsetContributionKernel / @ref CLGEMMLowpMatrixAReductionKernel / @ref CLGEMMLowpMatrixBReductionKernel / @ref CLGEMMLowpMatrixMultiplyCore
-    - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+    - CLGEMMLowpOffsetContributionKernel / CLGEMMLowpMatrixAReductionKernel / CLGEMMLowpMatrixBReductionKernel / CLGEMMLowpMatrixMultiplyCore
+    - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
 
  - New graph nodes for Arm® Neon™ and OpenCL
     - graph::BranchLayer
diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
index 3d3f7fef1e..3c8976fddd 100644
--- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
+++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
@@ -33,7 +33,7 @@
 #include "arm_compute/runtime/CL/CLTuner.h"
 #include "examples/gemm_tuner/CommonGemmExampleOptions.h"
 #include "examples/gemm_tuner/GemmTunerHelpers.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h"
 #include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
 #include "tests/CL/Helper.h"
 #include "utils/Utils.h"
@@ -168,8 +168,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options)
 
 } // namespace
 
-using CLGEMMReshapeLHSMatrix           = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>;
-using CLGEMMLowpMatrixMultiplyReshaped = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedKernel>;
+using ClGemmReshapeLHSMatrix           = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>;
+using ClGemmLowpMatrixMultiplyReshaped = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedKernel>;
 
 class CLGEMMLowpMatrixMultiplyReshapedExample : public Example
 {
@@ -269,20 +269,20 @@ public:
         // Validate argments
         if(!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d()))
         {
-            std::cerr << "Invalid arguments for CLGEMMReshapeLHSMatrixKernel." << std::endl;
+            std::cerr << "Invalid arguments for ClGemmReshapeLHSMatrixKernel." << std::endl;
             return false;
         }
 
         if(!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info))
         {
-            std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedKernel." << std::endl;
+            std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedKernel." << std::endl;
             return false;
         }
 
         // Configure functions
         reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
 
-        gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, gemm_info);
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info);
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -298,7 +298,8 @@ public:
         ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } });
         reshape_lhs.run(reshape_lsh_pack);
 
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         // Make sure all the OpenCL jobs are done executing:
         CLScheduler::get().sync();
@@ -315,8 +316,8 @@ private:
     CLTensor                         rhs_reshaped{};
     CLTensor                         dst{};
     CLTuner                          tuner{};
-    CLGEMMReshapeLHSMatrix           reshape_lhs{};
-    CLGEMMLowpMatrixMultiplyReshaped gemm{};
+    ClGemmReshapeLHSMatrix           reshape_lhs{};
+    ClGemmLowpMatrixMultiplyReshaped gemm{};
 };
 
 /** Main test program for gemmlowp reshaped
diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
index ca7b7a5f04..15c1b86c61 100644
--- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
+++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
@@ -35,8 +35,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
 #include "tests/CL/Helper.h"
 #include "utils/Utils.h"
 #include "utils/command_line/CommandLineOptions.h"
@@ -47,6 +47,7 @@
 
 using namespace arm_compute;
 using namespace utils;
+using namespace arm_compute::opencl::kernels;
 using namespace arm_compute::misc::shape_calculator;
 using namespace gemm_tuner;
 
@@ -146,8 +147,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options)
 
 } // namespace
 
-using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>;
-using CLGEMMLowpMatrixAReduction              = test::CLSynthetizeFunction<CLGEMMLowpMatrixAReductionKernel>;
+using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>;
+using ClGemmLowpMatrixAReduction              = test::CLSynthetizeOperator<ClGemmLowpMatrixAReductionKernel>;
 
 class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFusedOutputStageFixedpointExample : public Example
 {
@@ -289,7 +290,7 @@ public:
             const TensorInfo info_vector_sum_row(compute_reductionB_shape(*lhs.info()), 1, DataType::S32);
             vector_sum_row.allocator()->init(info_vector_sum_row);
 
-            mtx_a_reduction = std::make_unique<CLGEMMLowpMatrixAReduction>();
+            mtx_a_reduction = std::make_unique<ClGemmLowpMatrixAReduction>();
 
             if(!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}))
             {
@@ -297,7 +298,7 @@ public:
                 return false;
             }
 
-            mtx_a_reduction->configure(&lhs, &vector_sum_row, GEMMLowpReductionKernelInfo{});
+            mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{});
         }
         // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
         if(gemm_info.a_offset != 0)
@@ -311,12 +312,14 @@ public:
         if(!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(),
                           gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), dst_shifts.info()))
         {
-            std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel." << std::endl;
+            std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel." << std::endl;
             return false;
         }
 
         // Configure function
-        gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info, gemm_info.a_offset == 0 ? nullptr : &vector_sum_col, gemm_info.b_offset == 0 ? nullptr : &vector_sum_row, &bias, &dst_multipliers, &dst_shifts);
+        gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info,
+                       gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(),
+                       bias.info(), dst_multipliers.info(), dst_shifts.info());
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -335,9 +338,12 @@ public:
     {
         if(mtx_a_reduction != nullptr)
         {
-            mtx_a_reduction->run();
+            ITensorPack red_pack({ { ACL_SRC, &lhs }, { ACL_DST, &dst } });
+            mtx_a_reduction->run(red_pack);
         }
-        gemm.run();
+
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_BIAS, &bias }, { ACL_VEC_COL_SUM, &vector_sum_col }, { ACL_VEC_ROW_SUM, &vector_sum_row }, { ACL_SHIFTS, &dst_shifts }, { ACL_MULTIPLIERS, &dst_multipliers }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         // Make sure all the OpenCL jobs are done executing:
         CLScheduler::get().sync();
@@ -358,8 +364,8 @@ private:
     CLTensor                                    dst_multipliers{};
     CLTensor                                    dst_shifts{};
     CLTuner                                     tuner{};
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHS     gemm{};
-    std::unique_ptr<CLGEMMLowpMatrixAReduction> mtx_a_reduction{ nullptr };
+    ClGemmLowpMatrixMultiplyReshapedOnlyRhs     gemm{};
+    std::unique_ptr<ClGemmLowpMatrixAReduction> mtx_a_reduction{ nullptr };
 };
 
 /** Main test program for gemmlowp reshaped rhs only with fused output stage fixedpoint
diff --git a/filelist.json b/filelist.json
index bde361f167..8a53829c5b 100644
--- a/filelist.json
+++ b/filelist.json
@@ -229,6 +229,21 @@
           ]
         }
       },
+      "GEMMLowp": {
+        "files": {
+          "kernel": [
+            "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp",
+            "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp",
+            "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+            "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp",
+            "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp",
+            "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
+            "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
+            "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp",
+            "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp" 
+          ]
+        }
+      },
       "Mul": {
         "files": {
           "operator": [
@@ -420,69 +435,6 @@
           ]
         }
       },
-      "GEMMLowpMatrixMultiplyNative": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowpMatrixMultiplyReshaped": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowpMatrixMultiplyReshapedOnlyRHS": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowpOffsetContribution": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowpOffsetContributionOutputStage": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowpQuantizeDownInt32ScaleByFixedPoint": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowpQuantizeDownInt32ScaleByFloat": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowpQuantizeDownInt32Scale": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowpReduction": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp"
-          ]
-        }
-      },
       "InstanceNormalization": {
         "files": {
           "kernel": [
diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h
index c380a9cc68..09f8109445 100644
--- a/src/core/CL/CLKernels.h
+++ b/src/core/CL/CLKernels.h
@@ -42,15 +42,6 @@
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "src/core/CL/kernels/CLGatherKernel.h"
 #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
 #include "src/core/CL/kernels/CLIm2ColKernel.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index efba3d1d7a..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                          const GEMMReshapeInfo &gemm_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    if(input0->data_type() == DataType::QASYMM8)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
-    const int m = gemm_info.m();
-    const int n = gemm_info.n();
-    const int k = gemm_info.k();
-
-    ARM_COMPUTE_UNUSED(m);
-    ARM_COMPUTE_UNUSED(n);
-    ARM_COMPUTE_UNUSED(k);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
-    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
-    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
-    if(gemm_info.reinterpret_input_as_3d())
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
-    }
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d();
-    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
-
-    Window win{};
-    bool   window_changed = false;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
-    TensorInfo tmp_info(*output);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-
-    win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    // RHS matrix still needs padding on the X
-    AccessWindowStatic input1_access(input1, 0, 0,
-                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
-                                     input1->dimension(1));
-
-    window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
-{
-    _type = CLKernelType::GEMM;
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                     const GEMMReshapeInfo &gemm_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                     const GEMMRHSMatrixInfo &rhs_info,
-                                                     const GEMMReshapeInfo   &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
-    _input0                   = input0;
-    _input1                   = input1;
-    _output                   = output;
-    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d();
-    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
-    // We still need padding on the X dimension for the RHS matrix
-    auto padding_info = get_padding_info({ input0, output });
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
-    {
-        _reinterpret_input_as_3d  = false;
-        _reinterpret_output_as_3d = false;
-    }
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
-    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
-    // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
-    const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
-
-    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
-    // NOTE: This might have implications on heuristics and performance
-    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    std::string kernel_name("gemmlowp_mm_native");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
-    _config_id += "_";
-    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k());
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.k0);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                      const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              output->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
-        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    if(_reinterpret_output_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 125f0c6948..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
-class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyNativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1    Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                       lhs_info.m0: 2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info  RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: same as lhs_info.k0
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1          Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                             lhs_info.m0: 2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: same as lhs_info.k0
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMReshapeInfo &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel
-     *
-     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] input1    Input tensor info for the RHS matrix. Data type supported: same as @p input0
-     * @param[in] output    Output tensor info. Data type supported: S32
-     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                      lhs_info.m0: 2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     * @param[in] rhs_info  RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: same as lhs_info.k0
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMReshapeInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index fd533fc310..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                          const GEMMReshapeInfo &gemm_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
-    ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
-    const int m = gemm_info.m();
-    const int n = gemm_info.n();
-    const int k = gemm_info.k();
-
-    TensorShape tensor_shape0{ input0->tensor_shape() };
-    tensor_shape0.set(0, k);
-    tensor_shape0.set(1, m);
-
-    TensorShape tensor_shape1{ input1->tensor_shape() };
-    tensor_shape1.set(0, n);
-    tensor_shape1.set(1, k);
-
-    const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
-    const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
-    const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
-    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
-    TensorInfo tmp_info(*output);
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-    Window win                          = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    return std::make_pair(Status{}, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
-{
-    _type = CLKernelType::GEMM;
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                       const GEMMReshapeInfo &gemm_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                       const GEMMRHSMatrixInfo &rhs_info,
-                                                       const GEMMReshapeInfo   &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
-    _input0                   = input0;
-    _input1                   = input1;
-    _output                   = output;
-    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-    _k                        = gemm_info.k();
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    auto              padding_info = get_padding_info({ input0, input1, output });
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
-
-    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
-    const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
-    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
-    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
-    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    std::string kernel_name("gemmlowp_mm_reshaped_");
-    kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
-    kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
-    _config_id += "_";
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k());
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.k0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.v0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.h0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.interleave);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.interleave);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                        const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              output->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    if(_reinterpret_output_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 4;
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index 06a73f173d..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through:
- *  - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel
- *  - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
- */
-class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyReshapedKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                       lhs_info.m0: 2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     *                       lhs_info.transpose: false
-     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: same as lhs_info.k0
-     *                       rhs_info.transpose: true
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info        LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                             lhs_info.m0: 2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     *                             lhs_info.transpose: false
-     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: same as lhs_info.k0
-     *                             rhs_info.transpose: true
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMReshapeInfo &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel
-     *
-     * @param[in] input0    Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in] input1    Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in] output    Output tensor info. Data type supported: S32
-     * @param[in] lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                      lhs_info.m0: 2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     *                      lhs_info.transpose: false
-     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: 2,3,4,8,16
-     *                      rhs_info.transpose: true
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMReshapeInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_output_as_3d;
-    unsigned int     _k;
-    bool             _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
deleted file mode 100644
index 3424b6d264..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ /dev/null
@@ -1,574 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
-                          const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    if(input0->data_type() == DataType::QASYMM8)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-
-    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
-    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
-    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
-    const int m = gemm_info.m;
-    const int n = gemm_info.n;
-    const int k = gemm_info.k;
-
-    TensorShape tensor_shape1{ input1->tensor_shape() };
-    tensor_shape1.set(0, n);
-    tensor_shape1.set(1, k);
-
-    const TensorInfo tensor_info1          = input1->clone()->set_tensor_shape(tensor_shape1);
-    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
-    if(gemm_info.reinterpret_input_as_3d)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
-    const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(expected_output_shape);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        if(output_stage.type == GEMMLowpOutputStageType::NONE)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-        }
-    }
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
-                                    "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
-
-    // Checks performed if the output stage needs to be fused
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
-    {
-        // If a_offset == 0, vector_sum_col can be a nullptr
-        if(gemm_info.a_offset != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_output_shape[0]);
-        }
-
-        // If b_offset == 0, vector_sum_row can be a nullptr
-        if(gemm_info.b_offset != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
-            // Check if mm result is a 3D reinterpretation
-            const bool reinterpret_as_3d = expected_output_shape.num_dimensions() > 1 && expected_output_shape.y() != vector_sum_row->tensor_shape().x();
-
-            // Validate input
-            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_output_shape[1] * expected_output_shape[2]));
-            ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_output_shape[1]);
-
-            if(expected_output_shape.num_dimensions() > 1)
-            {
-                const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
-                TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
-                vector_sum_row_shape.collapse_from(1);
-                TensorShape collapsed_output_shape(expected_output_shape);
-                collapsed_output_shape.collapse_from(output_batch_idx);
-
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_output_shape[output_batch_idx],
-                                                "vector_sum_row must have the same number of batches of output tensor");
-
-                if(gemm_info.a_offset != 0)
-                {
-                    TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
-                    vector_sum_col_shape.collapse_from(1);
-
-                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                    "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
-                }
-            }
-        }
-
-        if(output->total_size() != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-
-        if(output_multipliers != nullptr && output_shifts != nullptr)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-            if(output_stage.is_quantized_per_channel)
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_shifts->dimension(0));
-                ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_multipliers->dimension(0));
-            }
-        }
-    }
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMKernelInfo &gemm_info,
-                                                        ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
-                                                        ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
-{
-    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
-    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d != 0);
-
-    Window win{};
-    Window win_out{};
-    bool   window_changed = false;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // Output tensor auto initialization if not yet initialized
-    const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
-    if(output_stage.type != GEMMLowpOutputStageType::NONE)
-    {
-        auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(output_stage.output_data_type));
-    }
-    else
-    {
-        auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(DataType::S32));
-    }
-
-    TensorInfo tmp_info(*output);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
-    num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
-
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
-    {
-        if(gemm_info.a_offset != 0)
-        {
-            AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
-            window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
-        }
-        // No access window needed for vector_sum_row
-        ARM_COMPUTE_UNUSED(vector_sum_row);
-
-        if(bias != nullptr)
-        {
-            AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
-            window_changed = window_changed || update_window_and_padding(win_out, bias_access);
-        }
-
-        if(output_multipliers != nullptr && output_stage.is_quantized_per_channel)
-        {
-            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
-            AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
-            window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
-        }
-    }
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel()
-    : _input0(nullptr),
-      _input1(nullptr),
-      _output(nullptr),
-      _vector_sum_col(nullptr),
-      _vector_sum_row(nullptr),
-      _bias(nullptr),
-      _output_multipliers(nullptr),
-      _output_shifts(nullptr),
-      _slide_matrix_b(true),
-      _reinterpret_input_as_3d(false),
-      _reinterpret_output_as_3d(false),
-      _use_dummy_work_items(false),
-      _is_quantized_per_channel(false),
-      _fuse_output_stage(false)
-{
-    _type = CLKernelType::GEMM;
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info,
-                                                              const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
-                                                              const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
-                                                              const GEMMKernelInfo &gemm_info,
-                                                              const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
-                                                              const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(),
-                                                  input1->info(),
-                                                  output->info(),
-                                                  gemm_info,
-                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
-                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
-                                                  bias != nullptr ? bias->info() : nullptr,
-                                                  output_multipliers != nullptr ? output_multipliers->info() : nullptr,
-                                                  output_shifts != nullptr ? output_shifts->info() : nullptr));
-
-    auto                          padding_info = get_padding_info({ input0, input1, output, vector_sum_row });
-    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
-    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
-    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-    const int32_t                 a_offset     = gemm_info.a_offset;
-    const int32_t                 b_offset     = gemm_info.b_offset;
-
-    _input0                   = input0;
-    _input1                   = input1;
-    _output                   = output;
-    _vector_sum_col           = vector_sum_col;
-    _vector_sum_row           = vector_sum_row;
-    _bias                     = bias;
-    _output_multipliers       = output_multipliers;
-    _output_shifts            = output_shifts;
-    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
-    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-    _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
-    {
-        _reinterpret_input_as_3d  = false;
-        _reinterpret_output_as_3d = false;
-    }
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(),
-                                                    input1->info(),
-                                                    output->info(),
-                                                    gemm_info,
-                                                    vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
-                                                    vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
-                                                    bias != nullptr ? bias->info() : nullptr,
-                                                    output_multipliers != nullptr ? output_multipliers->info() : nullptr,
-                                                    output_shifts != nullptr ? output_shifts->info() : nullptr,
-                                                    num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
-    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
-    // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
-    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
-    // NOTE: This might have implications on heuristics and performance
-    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int partial_store_m0 = internal_m % internal_m0;
-    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
-    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-
-    std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
-    kernel_name += rhs_info.transpose ? "t" : "nt";
-
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
-    {
-        kernel_name += "_fused_output_stage_fixedpoint";
-        _fuse_output_stage = true;
-        // If a_offset == 0, vector_sum_col can be a nullptr
-        if(a_offset != 0 && vector_sum_col != nullptr)
-        {
-            build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
-            build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
-        }
-        // If b_offset == 0, vector_sum_row can be a nullptr
-        build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
-        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * input0->info()->dimension(0)));
-        build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-        build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
-        build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
-        build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
-        build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-
-        const int min = output_stage.gemmlowp_min_bound;
-        const int max = output_stage.gemmlowp_max_bound;
-
-        PixelValue min_val{};
-        PixelValue max_val{};
-        std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
-        build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
-        build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
-    _config_id += "_";
-    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.k0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.h0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.interleave);
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
-                                                               const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                               const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              output->clone().get(),
-                                                              gemm_info,
-                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
-                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
-                                                              bias != nullptr ? bias->clone().get() : nullptr,
-                                                              output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
-                                                              output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
-        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    if(_reinterpret_output_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    // Set window for vector_sum_col
-    Window win_vector_sum_col = slice;
-    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    // Set window for vector_sum_row
-    Window win_vector_sum_row = slice;
-    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    Window biases_slice = slice;
-    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        if(_reinterpret_input_as_3d)
-        {
-            // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-            idx++;
-        }
-
-        if(_reinterpret_output_as_3d)
-        {
-            // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-            idx++;
-        }
-
-        if(_fuse_output_stage)
-        {
-            add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
-            add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
-            add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-            add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
-            add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
-        }
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index e79f6dfe05..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
- * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
- */
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0             Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                                Only the following values are supported for LHS info:
-     *                                lhs_info.m0: 2,3,4,5,6,7,8
-     *                                lhs_info.k0: 2,3,4,8,16
-     *                                Only the following values are supported for RHS info:
-     *                                rhs_info.n0: 2,3,4,8,16
-     *                                rhs_info.k0: same as lhs_info.k0
-     *                                rhs_info.transpose: true
-     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
-                   const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input0             Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                                Only the following values are supported for LHS info:
-     *                                lhs_info.m0: 2,3,4,5,6,7,8
-     *                                lhs_info.k0: 2,3,4,8,16
-     *                                Only the following values are supported for RHS info:
-     *                                rhs_info.n0: 2,3,4,8,16
-     *                                rhs_info.k0: same as lhs_info.k0
-     *                                rhs_info.transpose: true
-     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
-                   const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
-     *
-     * @param[in] input0             Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] input1             Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
-     * @param[in] output             Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in] gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                               Only the following values are supported for LHS info:
-     *                               lhs_info.m0: 2,3,4,5,6,7,8
-     *                               lhs_info.k0: 2,3,4,8,16
-     *                               Only the following values are supported for RHS info:
-     *                               rhs_info.n0: 2,3,4,8,16
-     *                               rhs_info.k0: same as lhs_info.k0
-     *                               rhs_info.transpose: true
-     * @param[in] vector_sum_col     (Optional) Input row-vector info of sums of all the entries in each column of matrix B.
-     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in] vector_sum_row     (Optional) Input row-vector info of sums of all the entries in each row of matrix A.
-     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in] bias               (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32.
-     * @param[in] output_shifts      (Optional) Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr,
-                           const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr,
-                           const ITensorInfo *output_shifts = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    const ICLTensor *_bias;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-    bool             _is_quantized_per_channel;
-    bool             _fuse_output_stage;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
deleted file mode 100644
index b97bb84e59..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                          int32_t a_offset, int32_t b_offset)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
-    }
-
-    // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
-    }
-
-    // If b_offset == 0, vector_sum_row can be a nullptr
-    if(b_offset != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
-        // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
-        // Validate input
-        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
-        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
-        TensorShape output_shape = mm_result->tensor_shape();
-        if(output_shape.num_dimensions() > 1)
-        {
-            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
-            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
-            vector_sum_row_shape.collapse_from(1);
-            output_shape.collapse_from(output_batch_idx);
-
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
-                                            "mm_result tensor must have the same number of batches of output tensor");
-
-            if(a_offset != 0)
-            {
-                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
-                vector_sum_col_shape.collapse_from(1);
-
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
-            }
-        }
-    }
-
-    return Status{};
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
-    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
-{
-    _type = CLKernelType::ELEMENTWISE;
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
-                                                   int32_t b_offset)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset);
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
-                                                   const ICLTensor *bias,
-                                                   int32_t k, int32_t a_offset,
-                                                   int32_t b_offset)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
-                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
-                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
-                                                  bias != nullptr ? bias->info() : nullptr,
-                                                  a_offset, b_offset)); // NOLINT
-
-    auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias });
-
-    _vector_sum_col = vector_sum_col;
-    _vector_sum_row = vector_sum_row;
-    _mm_result      = mm_result;
-    _bias           = bias;
-
-    // Check if input is a 3D reinterpretation
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->info()->num_dimensions() > 1
-                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration));
-
-    // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
-    {
-        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
-        build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
-    }
-    // If b_offset == 0, vector_sum_row can be a nullptr
-    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
-    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
-    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
-    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
-    std::string kernel_name("gemmlowp_offset_contribution");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name + "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                    int32_t a_offset, int32_t b_offset)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
-    return Status{};
-}
-
-void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    // Set window for vector_sum_col
-    Window win_vector_sum_col = slice;
-    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    // Set window for vector_sum_row
-    Window win_vector_sum_row = slice;
-    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    Window biases_slice = slice;
-    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _mm_result, slice);
-        add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
-        add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
-        add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index f8705595a0..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- *                   (vector_sum_col[k] * a_offset) +
- *                   (vector_sum_row[i] * b_offset) +
- *                   (a_offset * b_offset * k)
- *
- */
-class CLGEMMLowpOffsetContributionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpOffsetContributionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in, out] mm_result      Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]      vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in]      k              Number of matrix A columns or Matrix B rows
-     * @param[in]      a_offset       Offset to be added to each element of the matrix A.
-     * @param[in]      b_offset       Offset to be added to each element of the matrix B.
-     */
-    void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] mm_result       Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]      vector_sum_col  Input row-vector of sums of all the entries in each column of matrix B.
-     *                                 Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      vector_sum_row  Input row-vector of sums of all the entries in each row of matrix A.
-     *                                 Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                 Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in]      k               Number of matrix A columns or Matrix B rows
-     * @param[in]      a_offset        Offset to be added to each element of the matrix A.
-     * @param[in]      b_offset        Offset to be added to each element of the matrix B.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
-                   int32_t b_offset);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
-     *
-     * @param[in] mm_result      Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32
-     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] a_offset       Offset to be added to each element of the matrix A.
-     * @param[in] b_offset       Offset to be added to each element of the matrix B.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    ICLTensor       *_mm_result;
-    const ICLTensor *_bias;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
deleted file mode 100644
index eaa832fbaa..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
-                          int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-    if(output_stage.is_quantized_per_channel)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
-    }
-
-    // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
-    }
-
-    // If b_offset == 0, vector_sum_row can be a nullptr
-    if(b_offset != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
-        // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
-        // Validate input
-        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
-        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
-        TensorShape output_shape = mm_result->tensor_shape();
-        if(output_shape.num_dimensions() > 1)
-        {
-            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
-            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
-            vector_sum_row_shape.collapse_from(1);
-            output_shape.collapse_from(output_batch_idx);
-
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
-                                            "mm_result tensor must have the same number of batches of output tensor");
-
-            if(a_offset != 0)
-            {
-                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
-                vector_sum_col_shape.collapse_from(1);
-
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
-            }
-        }
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
-    // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect");
-
-    return Status{};
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel()
-    : _mm_result(nullptr),
-      _vector_sum_col(nullptr),
-      _vector_sum_row(nullptr),
-      _bias(nullptr),
-      _output(nullptr),
-      _output_multipliers(nullptr),
-      _output_shifts(nullptr),
-      _is_quantized_per_channel(false)
-{
-    _type = CLKernelType::ELEMENTWISE;
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
-                                                              int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                                                              const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, output, k, a_offset, b_offset, output_stage, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
-                                                              const ICLTensor *bias, ICLTensor *output,
-                                                              int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                                                              const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output, output_multipliers, output_shifts);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
-                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
-                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
-                                                  bias != nullptr ? bias->info() : nullptr,
-                                                  output->info(),
-                                                  a_offset, b_offset, output_stage,
-                                                  output_multipliers->info(), output_shifts->info())); // NOLINT
-
-    auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts });
-
-    const int min = output_stage.gemmlowp_min_bound;
-    const int max = output_stage.gemmlowp_max_bound;
-
-    _vector_sum_col           = vector_sum_col;
-    _vector_sum_row           = vector_sum_row;
-    _mm_result                = mm_result;
-    _bias                     = bias;
-    _output                   = output;
-    _output_multipliers       = output_multipliers;
-    _output_shifts            = output_shifts;
-    _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
-    // Check if input is a 3D reinterpretation
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->info()->num_dimensions() > 1
-                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
-    // Auto initialize the output
-    auto_init_if_empty(*output->info(), mm_result->info()->clone()->set_data_type(output_stage.output_data_type));
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration));
-
-    // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
-    {
-        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
-        build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
-    }
-    // If b_offset == 0, vector_sum_row can be a nullptr
-    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
-    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
-    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
-    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
-    build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
-    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
-    build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    PixelValue min_val{};
-    PixelValue max_val{};
-    std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
-    build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-
-    std::string kernel_name("gemmlowp_offset_contribution");
-    kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name + "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                               const ITensorInfo *output, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                                                               const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
-    return Status{};
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    // Set window for vector_sum_col
-    Window win_vector_sum_col = slice;
-    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    // Set window for vector_sum_row
-    Window win_vector_sum_row = slice;
-    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    Window biases_slice = slice;
-    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _mm_result, slice);
-        add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
-        add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
-        add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
-        add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 15f54d17a5..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
- * of matrix A and matrix B and performs the output stage defined by the output_stage argument
- *
- * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
- */
-class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpOffsetContributionOutputStageKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  k                  Number of matrix A columns or Matrix B rows
-     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
-     * @param[in]  output_stage       GEMMLowp output stage info
-     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     */
-    void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
-                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  k                  Number of matrix A columns or Matrix B rows
-     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
-     * @param[in]  output_stage       GEMMLowp output stage info
-     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
-                   int32_t k,
-                   int32_t a_offset, int32_t b_offset,
-                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
-     *
-     * @param[in] mm_result          Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32
-     * @param[in] vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in] a_offset           Offset to be added to each element of the matrix A.
-     * @param[in] b_offset           Offset to be added to each element of the matrix B.
-     * @param[in] output_stage       GEMMLowp output stage info
-     * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32
-     * @param[in] output_shifts      Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
-                           int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_mm_result;
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _is_quantized_per_channel;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
deleted file mode 100644
index 03bdf3f836..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-
-    // Check biases if exist
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-    _type = CLKernelType::ELEMENTWISE;
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                    const GEMMLowpOutputStageInfo *info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
-    return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                                   const GEMMLowpOutputStageInfo *info)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
-    auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
-
-    _input  = input;
-    _bias   = bias;
-    _output = output;
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    auto           min = info->gemmlowp_min_bound;
-    auto           max = info->gemmlowp_max_bound;
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
-    build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
-    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
-    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
-                             "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
-                             "-DMAX_BOUND=" + support::cpp11::to_string(max));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
-    // Create kernel
-    const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Create input window
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    // Setup bias slice
-    unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(_bias != nullptr)
-    {
-        Window biases_slice(slice);
-        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-        add_1D_tensor_argument(idx1, _bias, biases_slice);
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx1, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
deleted file mode 100644
index 8653102cd8..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Round to nearest division by a power-of-two using result_shift
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
-     * @param[in]  info            Output stage info. Used to pass the quantized output data type
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QSYMM8/QASYMM8_SIGNED/QSYMM16.
-     * @param[in] info   Output stage info. Used to pass the quantized output data type
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
deleted file mode 100644
index abef484c1e..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                          const GEMMLowpOutputStageInfo *info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
-    ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
-    ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
-                                || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
-
-    // Check biases if exist
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-} // namespace
-
-class Coordinates;
-CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-    _type = CLKernelType::ELEMENTWISE;
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                               const GEMMLowpOutputStageInfo *info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
-    return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                              const GEMMLowpOutputStageInfo *info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                              const GEMMLowpOutputStageInfo *info)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
-    auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
-
-    _input  = input;
-    _bias   = bias;
-    _output = output;
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
-    auto min = info->gemmlowp_min_bound;
-    auto max = info->gemmlowp_max_bound;
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
-    build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
-    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Create input window
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    // Setup bias slice
-    unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(_bias != nullptr)
-    {
-        Window biases_slice(slice);
-        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-        add_1D_tensor_argument(idx1, _bias, biases_slice);
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx1, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
deleted file mode 100644
index 0a8d5e1942..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Requantize
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to
- *      - to the [0..255] range and cast to QASYMM8.
- *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data type supported: S32
-     * @param[in]  bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                    Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  info   Output stage info. Used to pass the quantized output data type
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  info            Output stage info. Used to pass the quantized output data type
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] info   Output stage info. Used to pass the quantized output data type
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
deleted file mode 100644
index faf86c769e..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
-                                || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
-
-    // Check biases if exist
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != output_stage->output_data_type, "Mismatching output data type");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-} //namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-    _type = CLKernelType::ELEMENTWISE;
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage));
-
-    return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                       const GEMMLowpOutputStageInfo *output_stage)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
-                                                  (bias != nullptr) ? bias->info() : nullptr,
-                                                  output->info(),
-                                                  output_stage));
-
-    auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type));
-
-    _input  = input;
-    _bias   = bias;
-    _output = output;
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    auto           min = output_stage->gemmlowp_min_bound;
-    auto           max = output_stage->gemmlowp_max_bound;
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
-    build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
-    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
-    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
-                             "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
-                             "-DMAX_BOUND=" + support::cpp11::to_string(max));
-    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(_bias != nullptr)
-    {
-        Window biases_slice(slice);
-        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-        add_1D_tensor_argument(idx1, _bias, biases_slice);
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx1, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index abdf33ea43..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Add offset terms to final result
- *  -# Multiply each entry of result by result_mult_int
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Shift the int32 accumulator by result_shift
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values:
- *  -#  -to the [0..255] range and cast to QASYMM8.
- *  -#  -to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input        Input tensor. Data type supported: S32
-     * @param[in]  bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output       Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  output_stage GEMMLowp output stage metadata.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  output_stage    GEMMLowp output stage metadata.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
-     *
-     * @param[in] input        Input tensor. Data type supported: S32
-     * @param[in] bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                         Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output       Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] output_stage GEMMLowp output stage metadata.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
deleted file mode 100644
index 55ae3ea6ca..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
-
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
-    }
-    return Status{};
-}
-
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
-
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
-    }
-    return Status{};
-}
-} // namespace
-
-ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
-    : _input(), _output()
-{
-    _type = CLKernelType::ELEMENTWISE;
-}
-
-void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), mtx_a, vector_sum_row, info);
-}
-
-void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32);
-
-    auto padding_info = get_padding_info({ mtx_a, vector_sum_row });
-
-    _input  = mtx_a;
-    _output = vector_sum_row;
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->info()->data_type()));
-    build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
-    const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
-
-    std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    // This kernel does not need padding
-    Window win = calculate_max_window(*vector_sum_row->info(), Steps());
-    ICLKernel::configure_internal(win);
-
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(_input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(_input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(_input->info()->dimension(2));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
-    Window slice_in  = collapsed.first_slice_window_2D();
-    Window slice_out = collapsed.first_slice_window_2D();
-
-    // Setup input slice. Its dimensions are increased in the cl kernel.
-    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(collapsed.slide_window_slice_2D(slice_out));
-}
-
-void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), mtx_b, vector_sum_col, info);
-}
-
-void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
-
-    _input  = mtx_b;
-    _output = vector_sum_col;
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*_output->info(), TensorShape(mtx_b->info()->dimension(0)), 1, DataType::S32);
-
-    auto padding_info = get_padding_info({ mtx_b, vector_sum_col });
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0)));
-    build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1)));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->info()->data_type()));
-    build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
-
-    Window slice_out = collapsed.first_slice_window_2D();
-    Window slice_in  = slice_out;
-
-    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(collapsed.slide_window_slice_2D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
deleted file mode 100644
index 237d8099b7..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all OpenCL reduction kernels */
-class ICLGEMMLowpReductionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    ICLGEMMLowpReductionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
-     * @param[in]  info   Kernel metadata:
-     *                    - k            Number of matrix columns/rows depending on the type of reduction.
-     *                    - is_reshaped  True if the matrix has been reshaped.
-     *                    - scalar       Scalar value to multiply each reduced column/row by.
-     *                    - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[out] output          Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of matrix product
- *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in]  info           Kernel metadata:
-     *                            - k            Number of matrix columns/rows depending on the type of reduction.
-     *                            - is_reshaped  True if the matrix has been reshaped.
-     *                            - scalar       Scalar value to multiply each reduced column/row by.
-     *                            - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[out] vector_sum_row  Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
-     *
-     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in] info           Kernel metadata:
-     *                           - k            Number of matrix columns/rows depending on the type of reduction.
-     *                           - is_reshaped  True if the matrix has been reshaped.
-     *                           - scalar       Scalar value to multiply each reduced column/row by.
-     *                           - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in]  info           Kernel metadata:
-     *                            - k            Number of matrix columns/rows depending on the type of reduction.
-     *                            - is_reshaped  True if the matrix has been reshaped.
-     *                            - scalar       Scalar value to multiply each reduced column/row by.
-     *                            - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  mtx_b           Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[out] vector_sum_col  Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
-     *
-     * @param[in] mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in] info           Kernel metadata:
-     *                           - k            Number of matrix columns/rows depending on the type of reduction.
-     *                           - is_reshaped  True if the matrix has been reshaped.
-     *                           - scalar       Scalar value to multiply each reduced column/row by.
-     *                           - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
new file mode 100644
index 0000000000..ec0a3bf8e0
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    if(src0->data_type() == DataType::QASYMM8)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+    const int m = gemm_info.m();
+    const int n = gemm_info.n();
+    const int k = gemm_info.k();
+
+    ARM_COMPUTE_UNUSED(m);
+    ARM_COMPUTE_UNUSED(n);
+    ARM_COMPUTE_UNUSED(k);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k));
+    if(gemm_info.reinterpret_input_as_3d())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+    }
+
+    if(dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d();
+    bool          reinterpret_dst_as_3d               = (gemm_info.depth_output_gemm3d() != 0);
+
+    Window win{};
+    bool   window_changed = false;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_dst_as_3d to be false.
+    if(reinterpret_input_as_3d == reinterpret_dst_as_3d)
+    {
+        reinterpret_dst_as_3d = false;
+    }
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+
+    TensorInfo tmp_info(*dst);
+
+    if(reinterpret_dst_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    // RHS matrix still needs padding on the X
+    AccessWindowStatic src1_access(src1, 0, 0,
+                                   ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x),
+                                   src1->dimension(1));
+
+    window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst,
+                                                     const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+
+    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d();
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+    // We still need padding on the X dimension for the RHS matrix
+    auto padding_info = get_padding_info({ src0, dst });
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false.
+    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_src0 = src0->num_dimensions();
+    _slide_matrix_b                        = (src1->num_dimensions() >= num_dimensions_src0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+    // This means that the actual m used by the kernel is given by dst->info()->dimension(1) and not by gemm_info.m
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1);
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
+
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1)));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    std::string kernel_name("gemmlowp_mm_native");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+    _config_id += "_";
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
+                                                      const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
+                                                              src1->clone().get(),
+                                                              dst->clone().get(),
+                                                              lhs_info,
+                                                              rhs_info,
+                                                              gemm_info,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    if(src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if(_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    if(_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, src0, slice);
+        add_2D_tensor_argument(idx, src1, slice_b);
+        add_2D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
new file mode 100644
index 0000000000..491c3e44df
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
+class ClGemmLowpMatrixMultiplyNativeKernel : public IClKernel
+{
+public:
+    ClGemmLowpMatrixMultiplyNativeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyNativeKernel);
+    /** Initialise the kernel's input and dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Source tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  src1            Source tensor containing the RHS matrix. Data type supported: same as @p src0
+     * @param[out] dst             Destination tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread
+     *                             lhs_info.m0: 2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns to be processed by each thread
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.k0: same as lhs_info.k0
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst,
+                   const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst,
+                           const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool _slide_matrix_b{ true };
+    bool _reinterpret_input_as_3d{ false };
+    bool _reinterpret_output_as_3d{ false };
+    bool _use_dummy_work_items{ false };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H*/
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000000..44fda01ded
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst,
+                          const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+    const int m = gemm_info.m();
+    const int n = gemm_info.n();
+    const int k = gemm_info.k();
+
+    TensorShape tensor_shape0{ src0->tensor_shape() };
+    tensor_shape0.set(0, k);
+    tensor_shape0.set(1, m);
+
+    TensorShape tensor_shape1{ src1->tensor_shape() };
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+
+    const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    if(dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info,
+                                                        ElementsProcessed &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+
+    TensorInfo tmp_info(*dst);
+    if(reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+    Window win                          = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    return std::make_pair(Status{}, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
+                                                       const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+    _k                        = gemm_info.k();
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensionssrc0 = src0->num_dimensions();
+    _slide_matrix_b                       = (src1->num_dimensions() >= num_dimensionssrc0);
+
+    auto              padding_info = get_padding_info({ src0, src1, dst });
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1);
+
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+
+    std::string kernel_name("gemmlowp_mm_reshaped_");
+    kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+    kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+    _config_id += "_";
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.v0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.interleave);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
+                                                              src1->clone().get(),
+                                                              dst->clone().get(),
+                                                              lhs_info,
+                                                              rhs_info,
+                                                              gemm_info,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    if(src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if(_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 4;
+        const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, src0, slice);
+        add_2D_tensor_argument(idx, src1, slice_b);
+        add_2D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
new file mode 100644
index 0000000000..b99dec33af
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped
+ *
+ * @note The input matrices @p src0 and @p src1 must be reshaped through:
+ *  - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel
+ *  - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ */
+class ClGemmLowpMatrixMultiplyReshapedKernel : public IClKernel
+{
+public:
+    ClGemmLowpMatrixMultiplyReshapedKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedKernel);
+    /** Initialise the kernel's input and dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Source tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
+     * @param[in]  src1            Source tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3.
+     * @param[out] dst             Destination tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  lhs_info        LHS matrix information used for reshaping the src0 tensor.  Only the following values are supported:
+     *                             lhs_info.m0: 2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     *                             lhs_info.transpose: false
+     * @param[in]  rhs_info        RHS matrix information used for reshaping the src1 tensor.  Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.k0: same as lhs_info.k0
+     *                             rhs_info.transpose: true
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
+                   const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMReshapeInfo &gemm_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool         _slide_matrix_b{ true };
+    bool         _reinterpret_output_as_3d{ false };
+    unsigned int _k{ 1 };
+    bool         _use_dummy_work_items{ false };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H*/
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
new file mode 100644
index 0000000000..9d626936ff
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                          const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    if(src0->data_type() == DataType::QASYMM8)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+
+    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
+    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+    const int m = gemm_info.m;
+    const int n = gemm_info.n;
+    const int k = gemm_info.k;
+
+    TensorShape tensor_shape1{ src1->tensor_shape() };
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info1          = src1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+    if(gemm_info.reinterpret_input_as_3d)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+    if(dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        if(output_stage.type == GEMMLowpOutputStageType::NONE)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+        }
+    }
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+                                    "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
+
+    // Checks performed if the dst stage needs to be fused
+    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        // If a_offset == 0, vector_sum_col can be a nullptr
+        if(gemm_info.a_offset != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
+        }
+
+        // If b_offset == 0, vector_sum_row can be a nullptr
+        if(gemm_info.b_offset != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+            // Check if mm result is a 3D reinterpretation
+            const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+
+            // Validate input
+            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2]));
+            ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
+
+            if(expected_dst_shape.num_dimensions() > 1)
+            {
+                const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+                TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+                vector_sum_row_shape.collapse_from(1);
+                TensorShape collapsed_dst_shape(expected_dst_shape);
+                collapsed_dst_shape.collapse_from(dst_batch_idx);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
+                                                "vector_sum_row must have the same number of batches of dst tensor");
+
+                if(gemm_info.a_offset != 0)
+                {
+                    TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                    vector_sum_col_shape.collapse_from(1);
+
+                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                    "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                }
+            }
+        }
+
+        if(dst->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+        if(output_multipliers != nullptr && output_shifts != nullptr)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+            if(output_stage.is_quantized_per_channel)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
+                ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
+            }
+        }
+    }
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                                                        ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
+                                                        ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
+{
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d != 0);
+
+    Window win{};
+    Window win_out{};
+    bool   window_changed = false;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_output_as_3d = false;
+    }
+
+    // dst tensor auto initialization if not yet initialized
+    const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+    if(output_stage.type != GEMMLowpOutputStageType::NONE)
+    {
+        auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+    }
+    else
+    {
+        auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32));
+    }
+
+    TensorInfo tmp_info(*dst);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
+    num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
+
+    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        if(gemm_info.a_offset != 0)
+        {
+            AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
+        }
+        // No access window needed for vector_sum_row
+        ARM_COMPUTE_UNUSED(vector_sum_row);
+
+        if(bias != nullptr)
+        {
+            AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win_out, bias_access);
+        }
+
+        if(output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+        {
+            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
+            AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
+        }
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
+                                                              const GEMMKernelInfo &gemm_info,
+                                                              ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
+                                                              ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
+
+    auto                          padding_info = get_padding_info({ src0, src1, dst, vector_sum_row });
+    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
+    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+    const int32_t                 a_offset     = gemm_info.a_offset;
+    const int32_t                 b_offset     = gemm_info.b_offset;
+
+    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+    _is_quantized_per_channel = output_stage.is_quantized_per_channel;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_src0 = src0->num_dimensions();
+    _slide_matrix_b                        = (src1->num_dimensions() >= num_dimensions_src0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+    // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = internal_m % internal_m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+
+    std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
+    kernel_name += rhs_info.transpose ? "t" : "nt";
+
+    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        kernel_name += "_fused_output_stage_fixedpoint";
+        _fuse_output_stage = true;
+        // If a_offset == 0, vector_sum_col can be a nullptr
+        if(a_offset != 0 && vector_sum_col != nullptr)
+        {
+            build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+            build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+        }
+        // If b_offset == 0, vector_sum_row can be a nullptr
+        build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0)));
+        build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+        build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+        build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+        build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+        build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
+
+        const int min = output_stage.gemmlowp_min_bound;
+        const int max = output_stage.gemmlowp_max_bound;
+
+        PixelValue min_val{};
+        PixelValue max_val{};
+        std::tie(min_val, max_val) = get_min_max(dst->data_type());
+        build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+        build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    }
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+    _config_id += "_";
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                                                               const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                                                               const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
+                                                              src1->clone().get(),
+                                                              dst->clone().get(),
+                                                              gemm_info,
+                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              bias != nullptr ? bias->clone().get() : nullptr,
+                                                              output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+                                                              output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0               = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1               = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto bias               = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto vector_sum_col     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto output_shifts      = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+    const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+    auto       dst                = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    if(src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if(_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    if(_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, src0, slice);
+        add_2D_tensor_argument(idx, src1, slice_b);
+        add_2D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+        if(_reinterpret_input_as_3d)
+        {
+            // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+            idx++;
+        }
+
+        if(_reinterpret_output_as_3d)
+        {
+            // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+            idx++;
+        }
+
+        if(_fuse_output_stage)
+        {
+            add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+            add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+            add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+            add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice);
+            add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
+        }
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
new file mode 100644
index 0000000000..9e52b38249
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has been reshaped
+ *
+ * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
+ */
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel
+{
+public:
+    ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  src0               Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  src1               Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0
+     * @param[out] dst                Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+     *                                Only the following values are supported for LHS info:
+     *                                lhs_info.m0: 2,3,4,5,6,7,8
+     *                                lhs_info.k0: 2,3,4,8,16
+     *                                Only the following values are supported for RHS info:
+     *                                rhs_info.n0: 2,3,4,8,16
+     *                                rhs_info.k0: same as lhs_info.k0
+     *                                rhs_info.transpose: true
+     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32.
+     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                   ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr,
+                   ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                           const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr,
+                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool _slide_matrix_b{ true };
+    bool _reinterpret_input_as_3d{ false };
+    bool _reinterpret_output_as_3d{ false };
+    bool _use_dummy_work_items{ false };
+    bool _is_quantized_per_channel{ false };
+    bool _fuse_output_stage{ false };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
new file mode 100644
index 0000000000..e491cca914
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                          int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        TensorShape output_shape = mm_result->tensor_shape();
+        if(output_shape.num_dimensions() > 1)
+        {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches of output tensor");
+
+            if(a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+            }
+        }
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context,
+                                                   const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                                                   int32_t k, int32_t a_offset, int32_t b_offset)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
+
+    auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias });
+
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = vector_sum_row != nullptr
+                                   && mm_result->num_dimensions() > 1
+                                   && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0));
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+        build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+    }
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    std::string kernel_name("gemmlowp_offset_contribution");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+    IClKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name + "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(2));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                                                    int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
+    return Status{};
+}
+
+void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
+
+    const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto bias           = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto mm_result      = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST));
+
+    Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, mm_result, slice);
+        add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+        add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+        add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
new file mode 100644
index 0000000000..d1712f4f4b
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (vector_sum_col[k] * a_offset) +
+ *                   (vector_sum_row[i] * b_offset) +
+ *                   (a_offset * b_offset * k)
+ *
+ */
+class ClGemmLowpOffsetContributionKernel : public IClKernel
+{
+public:
+    ClGemmLowpOffsetContributionKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] mm_result       Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in]      vector_sum_col  Input row-vector of sums of all the entries in each column of matrix B.
+     *                                 Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      vector_sum_row  Input row-vector of sums of all the entries in each row of matrix A.
+     *                                 Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                 Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in]      k               Number of matrix A columns or Matrix B rows
+     * @param[in]      a_offset        Offset to be added to each element of the matrix A.
+     * @param[in]      b_offset        Offset to be added to each element of the matrix B.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                   int32_t k, int32_t a_offset, int32_t b_offset);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpOffsetContributionKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000000..1e2d7d7efe
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst,
+                          int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+    if(output_stage.is_quantized_per_channel)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
+    }
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        TensorShape output_shape = mm_result->tensor_shape();
+        if(output_shape.num_dimensions() > 1)
+        {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches of output tensor");
+
+            if(a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+            }
+        }
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
+    // Checks performed when output is configured
+    if((dst != nullptr) && (dst->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, dst);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect");
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutputStageKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context,
+                                                              const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
+                                                              int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
+                                                              const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
+
+    auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts });
+
+    const int min = output_stage.gemmlowp_min_bound;
+    const int max = output_stage.gemmlowp_max_bound;
+
+    _is_quantized_per_channel = output_stage.is_quantized_per_channel;
+
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = vector_sum_row != nullptr
+                                   && mm_result->num_dimensions() > 1
+                                   && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+    // Auto initialize the output
+    auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0));
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+        build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+    }
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+    build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+    build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
+    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+
+    PixelValue min_val{};
+    PixelValue max_val{};
+    std::tie(min_val, max_val) = get_min_max(dst->data_type());
+    build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+
+    std::string kernel_name("gemmlowp_offset_contribution");
+    kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name + "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(2));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                                                               const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
+                                                               const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
+    return Status{};
+}
+
+void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto mm_result          = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias               = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto vector_sum_col     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto output_shifts      = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+    const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+    auto       dst                = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, mm_result, slice);
+        add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+        add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+        add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice);
+        add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
new file mode 100644
index 0000000000..977f2eac53
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
+ * of matrix A and matrix B and performs the output stage defined by the output_stage argument
+ *
+ * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
+ */
+class ClGemmLowpOffsetContributionOutputStageKernel : public IClKernel
+{
+public:
+    ClGemmLowpOffsetContributionOutputStageKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionOutputStageKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+     * @param[out] dst                Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  k                  Number of matrix A columns or Matrix B rows
+     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
+     * @param[in]  output_stage       GEMMLowp output stage info
+     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
+                   int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
+                   const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset,
+                           int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool _is_quantized_per_channel{ false };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000000..8aec1654d9
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+
+    // Check biases if exist
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+    }
+
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+                                                                    const GEMMLowpOutputStageInfo *info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
+
+    return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+                                                                   const GEMMLowpOutputStageInfo *info)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
+
+    auto padding_info = get_padding_info({ src, bias, dst });
+
+    // dst auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+    // Set the arguments to pass at compile time
+    auto           min = info->gemmlowp_min_bound;
+    auto           max = info->gemmlowp_max_bound;
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
+    build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
+    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
+                             "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
+                             "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    // Create kernel
+    const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint";
+    _kernel                       = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Create src window
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Setup bias slice
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if(bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx1, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..c935aa7ec4
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel : public IClKernel
+{
+public:
+    ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst             Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
+     * @param[in]  info            Output stage info. Used to pass the quantized output data type
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
new file mode 100644
index 0000000000..9b488ff329
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
+    ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
+                                || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
+
+    // Check biases if exist
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+    }
+
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+                                                               const GEMMLowpOutputStageInfo *info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
+
+    return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+                                                              const GEMMLowpOutputStageInfo *info)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
+
+    auto padding_info = get_padding_info({ src, bias, dst });
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+    auto min = info->gemmlowp_min_bound;
+    auto max = info->gemmlowp_max_bound;
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
+    build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
+    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Create input window
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Setup bias slice
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if(bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx1, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
new file mode 100644
index 0000000000..eff8c4b2be
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Requantize
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleByFloatKernel : public IClKernel
+{
+public:
+    ClGemmLowpQuantizeDownInt32ScaleByFloatKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFloatKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst             Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  info            Output stage info. Used to pass the quantized output data type
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
new file mode 100644
index 0000000000..9a25973a93
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
+                                || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+
+    // Check biases if exist
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+    }
+
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} //namespace
+
+ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
+
+    return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+                                                       const GEMMLowpOutputStageInfo *output_stage)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
+
+    auto padding_info = get_padding_info({ src, bias, dst });
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+    // Set the arguments to pass at compile time
+    auto           min = output_stage->gemmlowp_min_bound;
+    auto           max = output_stage->gemmlowp_max_bound;
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
+    build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
+    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
+                             "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
+                             "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if(bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx1, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
new file mode 100644
index 0000000000..c5374755c8
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *  -#  -to the [0..255] range and cast to QASYMM8.
+ *  -#  -to the [-128..127] range and cast to QASYMM8_SIGNED.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleKernel : public ICLKernel
+{
+public:
+    ClGemmLowpQuantizeDownInt32ScaleKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst             Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  output_stage    GEMMLowp output stage metadata.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
new file mode 100644
index 0000000000..b4886805fb
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
+
+    if(dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+    }
+    return Status{};
+}
+
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+
+    if(dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+    }
+    return Status{};
+}
+} // namespace
+
+IClGemmLowpReductionKernel::IClGemmLowpReductionKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32);
+
+    auto padding_info = get_padding_info({ mtx_a, vector_sum_row });
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->dimension(0)));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->data_type()));
+    build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
+
+    const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+
+    std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    // This kernel does not need padding
+    Window win = calculate_max_window(*vector_sum_row, Steps());
+    ICLKernel::configure_internal(win);
+
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mtx_a->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mtx_a->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mtx_a->dimension(2));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+    Window slice_in  = collapsed.first_slice_window_2D();
+    Window slice_out = collapsed.first_slice_window_2D();
+
+    // Setup input slice. Its dimensions are increased in the cl kernel.
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice_in);
+        add_2D_tensor_argument(idx, dst, slice_out);
+        enqueue(queue, *this, slice_out, lws_hint());
+    }
+    while(collapsed.slide_window_slice_2D(slice_out));
+}
+
+void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32);
+
+    auto padding_info = get_padding_info({ mtx_b, vector_sum_col });
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0));
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0)));
+    build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1)));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->data_type()));
+    build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*vector_sum_col, Steps(num_elems_processed_per_iteration));
+    IClKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
+
+    Window slice_out = collapsed.first_slice_window_2D();
+    Window slice_in  = slice_out;
+
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice_in);
+        add_2D_tensor_argument(idx, dst, slice_out);
+        enqueue(queue, *this, slice_out, lws_hint());
+    }
+    while(collapsed.slide_window_slice_2D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h
new file mode 100644
index 0000000000..11188ed062
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Common interface for all OpenCL reduction kernels */
+class IClGemmLowpReductionKernel : public IClKernel
+{
+public:
+    IClGemmLowpReductionKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmLowpReductionKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] output          Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class ClGemmLowpMatrixAReductionKernel : public IClGemmLowpReductionKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] vector_sum_row  Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class ClGemmLowpMatrixBReductionKernel : public IClGemmLowpReductionKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  mtx_b           Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+     * @param[out] vector_sum_col  Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H */
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 31c8908270..bc9a3056e8 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -29,11 +29,6 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "src/core/gpu/cl/kernels/ClTransposeKernel.h"
 #include "support/Cast.h"
 
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 188f3b8819..cef8ad5a0d 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -31,11 +31,6 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLCol2ImKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "src/core/CL/kernels/CLIm2ColKernel.h"
 #include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index d5d1b5f41e..bab29a5095 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -30,11 +30,6 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "src/core/CL/kernels/CLIm2ColKernel.h"
 #include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 3be09581bd..6c64731f73 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -34,12 +34,12 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "src/core/gpu/cl/kernels/ClCastKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
 #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
@@ -49,6 +49,7 @@ namespace arm_compute
 {
 using namespace arm_compute::misc::shape_calculator;
 using namespace arm_compute::cl_gemm;
+using namespace arm_compute::opencl::kernels;
 
 namespace
 {
@@ -95,7 +96,7 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons
     // NOTE: This assumes:
     //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
     //  2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
-    if(!bool(CLGEMMLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
+    if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
     {
         return false;
     }
@@ -127,15 +128,15 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs
     TensorInfo tmp_b_info{};
     // Validate reshape RHS kernel
     auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-    if(!bool(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
     {
         return false;
     }
     // Validate mm kernel
     // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
     // NOTE: This assumes:
-    //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
-    //  2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
+    //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
+    //  2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
     GEMMKernelInfo gemm_kernel_info;
     gemm_kernel_info.m                       = m;
     gemm_kernel_info.n                       = n;
@@ -147,7 +148,7 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs
     // Since we ignore the output stage, output data type has to be S32 to pass the validation
     TensorInfo output_info_copy(*output);
     output_info_copy.set_data_type(DataType::S32);
-    if(!bool(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+    if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
     {
         return false;
     }
@@ -189,14 +190,14 @@ inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _weights_to_qasymm8(std::make_unique<opencl::kernels::ClCastKernel>()),
-      _mm_native_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyNativeKernel>()),
-      _mm_reshaped_only_rhs_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>()),
-      _mtx_b_reshape_kernel(std::make_unique<opencl::kernels::ClGemmReshapeRhsMatrixKernel>()),
-      _mtx_a_reduction_kernel(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _mtx_b_reduction_kernel(std::make_unique<CLGEMMLowpMatrixBReductionKernel>()),
-      _offset_contribution_kernel(std::make_unique<CLGEMMLowpOffsetContributionKernel>()),
-      _offset_contribution_output_stage_kernel(std::make_unique<CLGEMMLowpOffsetContributionOutputStageKernel>()),
+      _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
+      _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
+      _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
+      _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
+      _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
+      _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),
+      _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),
       _qasymm8_weights(),
       _vector_sum_col(),
       _vector_sum_row(),
@@ -206,6 +207,7 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo
       _gemm_output_stage_shifts(),
       _matrix_a(nullptr),
       _original_b(nullptr),
+      _c(nullptr),
       _output(nullptr),
       _a_offset(0),
       _b_offset(0),
@@ -235,6 +237,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _a_offset                    = a->info()->quantization_info().uniform().offset;
     _matrix_a                    = a;
+    _c                           = c;
     _output                      = output;
 
     _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
@@ -309,7 +312,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         }
 
         // Configure Matrix B reduction kernel
-        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
+        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), _vector_sum_col.info(), reduction_info);
     }
 
     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -320,7 +323,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         _memory_group.manage(&_vector_sum_row);
 
         // Configure matrix A reduction kernel
-        _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
+        _mtx_a_reduction_kernel->configure(compile_context, a->info(), _vector_sum_row.info(), reduction_info);
     }
 
     GEMMKernelInfo gemm_kernel_info;
@@ -356,8 +359,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
             // Configure and tune matrix multiply kernel with fused output stage
-            _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
-                                                    _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+            _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), gemm_kernel_info, _a_offset == 0 ? nullptr : _vector_sum_col.info(),
+                                                    _b_offset == 0 ? nullptr : _vector_sum_row.info(), c != nullptr ? c->info() : nullptr, _gemm_output_stage_multipliers.info(), _gemm_output_stage_shifts.info());
         }
         else
         {
@@ -367,7 +370,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
 
             if(_is_gemm_reshaped)
             {
-                _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+                _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), _mm_result_s32.info(), gemm_kernel_info);
             }
             else
             {
@@ -377,11 +380,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
                                                                               _matrix_a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : matrix_b->info(), reshape_info);
 
                 // Configure matrix multiply kernel
-                _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);
+                _mm_native_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), _mm_result_s32.info(), lhs_info, rhs_info, reshape_info);
 
-                _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
-                                                                    a->info()->dimension(0),
-                                                                    _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+                _offset_contribution_output_stage_kernel->configure(compile_context, _mm_result_s32.info(), _a_offset == 0 ? nullptr : _vector_sum_col.info(), _b_offset == 0 ? nullptr : _vector_sum_row.info(),
+                                                                    c != nullptr ? c->info() : nullptr, output->info(), a->info()->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
+                                                                    _gemm_output_stage_multipliers.info(), _gemm_output_stage_shifts.info());
                 _mm_result_s32.allocator()->allocate();
             }
         }
@@ -402,7 +405,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
+            _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), gemm_kernel_info);
         }
         else
         {
@@ -412,12 +415,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
                                                                           a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), reshape_info);
 
             // Configure matrix multiply kernel
-            _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, reshape_info);
+            _mm_native_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), lhs_info, rhs_info, reshape_info);
         }
 
         // Configure offset contribution kernel
-        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
-                                               _b_offset);
+        _offset_contribution_kernel->configure(compile_context, output->info(), _a_offset == 0 ? nullptr : _vector_sum_col.info(), _b_offset == 0 ? nullptr : _vector_sum_row.info(),
+                                               c != nullptr ? c->info() : nullptr, a->info()->dimension(0), _a_offset, _b_offset);
     }
 
     // Allocate tensors
@@ -480,7 +483,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
     {
         b_offset = -128;
         weights_info.set_data_type(DataType::QASYMM8);
-        ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
     }
     const ITensorInfo *matrix_b_info = &weights_info;
     if(reshape_matrix_b)
@@ -496,7 +499,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
 
         // Validate reshape RHS kernel
         auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
-        ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
     }
 
     TensorInfo info_vector_sum_col{};
@@ -509,7 +512,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
 
         // Configure Matrix B reduction kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
     }
 
     // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -518,7 +521,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
 
         // Configure matrix A reduction kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
     }
 
     GEMMKernelInfo gemm_kernel_info;
@@ -543,7 +546,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         gemm_kernel_info.output_stage = gemmlowp_output_stage;
         if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
                                                                                                 a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                 b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                 c,
@@ -560,7 +563,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
                 auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
 
                 // Validate matrix multiply
-                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
+                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
             }
             else
             {
@@ -575,11 +578,11 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
                 rhs_info       = res.rhs_info;
 
                 // Validate matrix multiply
-                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
             }
 
             // Validate offset contribution kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                 a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                 b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                 c,
@@ -595,7 +598,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         if(reshape_matrix_b)
         {
             // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
         }
         else
         {
@@ -606,13 +609,13 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
             rhs_info       = res.rhs_info;
 
             // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
         }
 
         if(output->total_size() != 0)
         {
             // Validate offset contribution kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,
                                                                                      a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                      b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                      c,
@@ -629,48 +632,83 @@ void CLGEMMLowpMatrixMultiplyCore::run()
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
+    const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : _original_b;
+
     if(_is_gemm_reshaped)
     {
+        matrix_b = &_tmp_b;
         if(!_reshape_b_only_on_first_run)
         {
             // Run reshape matrix B
-            ITensorPack mtx_b_pack;
-            mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b);
-            mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b);
-            CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false);
+            ITensorPack mtx_b_reshape_pack =
+            {
+                { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b },
+                { TensorType::ACL_DST, &_tmp_b }
+            };
+            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
         }
     }
 
     // Run matrix B reduction kernel only if _a_offset is not equal to 0
     if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
-        CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false);
+        ITensorPack mtx_b_red_pack =
+        {
+            { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b },
+            { TensorType::ACL_DST, &_vector_sum_col }
+        };
+        CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
     }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
     {
-        CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false);
+        ITensorPack mtx_a_red_pack = { { TensorType::ACL_SRC, _matrix_a }, { TensorType::ACL_DST, &_vector_sum_row } };
+        CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
     }
 
     // Run matrix multiply
     if(_is_gemm_reshaped)
     {
-        CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false);
+        ITensorPack gemm_reshaped_pack;
+        if(_run_offset_contribution)
+        {
+            gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_DST, _run_output_stage ? &_mm_result_s32 : _output } });
+        }
+        else
+        {
+            gemm_reshaped_pack = ITensorPack(
+            {
+                { TensorType::ACL_SRC, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : &_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : &_vector_sum_col }, { TensorType::ACL_SHIFTS, &_gemm_output_stage_shifts }, { TensorType::ACL_MULTIPLIERS, &_gemm_output_stage_multipliers }, { TensorType::ACL_DST, _output },
+            });
+        }
+        CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
     }
     else
     {
-        CLScheduler::get().enqueue(*_mm_native_kernel, false);
+        ITensorPack gemm_native_pack =
+        {
+            { TensorType::ACL_SRC_0, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_DST, _run_offset_contribution ? _output :&_mm_result_s32 }
+        };
+        CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
     }
     if(_run_output_stage)
     {
         // Run offset contribution/output stage kernel
-        CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true);
+        ITensorPack output_stage_pack =
+        {
+            { TensorType::ACL_SRC, &_mm_result_s32 }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr :&_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr :&_vector_sum_col }, { TensorType::ACL_SHIFTS, &_gemm_output_stage_shifts }, { TensorType::ACL_MULTIPLIERS, &_gemm_output_stage_multipliers }, { TensorType::ACL_DST, _output },
+        };
+        CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
     }
     if(_run_offset_contribution)
     {
         // Run offset contribution kernel
-        CLScheduler::get().enqueue(*_offset_contribution_kernel, true);
+        ITensorPack offset_contrib_pack =
+        {
+            { TensorType::ACL_SRC_DST, _output }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr :&_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr :&_vector_sum_col }
+        };
+        CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
     }
 }
 
@@ -691,9 +729,11 @@ void CLGEMMLowpMatrixMultiplyCore::prepare()
 
             // Run reshape kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
-            ITensorPack mtx_b_pack;
-            mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b);
-            mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b);
+            ITensorPack mtx_b_pack =
+            {
+                { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b },
+                { TensorType::ACL_DST, &_tmp_b }
+            };
             CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
             _original_b->mark_as_unused();
         }
@@ -702,7 +742,12 @@ void CLGEMMLowpMatrixMultiplyCore::prepare()
         if(_a_offset != 0 && _reshape_b_only_on_first_run)
         {
             _vector_sum_col.allocator()->allocate();
-            CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false);
+            ITensorPack mtx_b_red_pack =
+            {
+                { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b },
+                { TensorType::ACL_DST, &_vector_sum_col }
+            };
+            CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
         }
 
         CLScheduler::get().queue().finish();
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index be452aaf3d..e230e8f2e6 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,111 +25,23 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
 
 #include <algorithm>
 
 namespace arm_compute
 {
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                                    int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                                                                    int min, int max)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                                    int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                                                                    int min, int max)
-{
-    GEMMLowpOutputStageInfo info{};
-    info.gemmlowp_multiplier = result_fixedpoint_multiplier;
-    info.gemmlowp_shift      = result_shift;
-    info.gemmlowp_offset     = result_offset_after_shift;
-    info.gemmlowp_min_bound  = min;
-    info.gemmlowp_max_bound  = max;
-    info.output_data_type    = DataType::QASYMM8;
-    auto k                   = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
-    k->configure(compile_context, input, bias, output, &info);
-    _kernel = std::move(k);
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                     int min, int max)
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage()
+    : _kernel(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
 {
-    GEMMLowpOutputStageInfo info{};
-    info.gemmlowp_min_bound = min;
-    info.gemmlowp_max_bound = max;
-    info.output_data_type   = DataType::QASYMM8;
-    return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                                                                   int min, int max)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                                                                   int min, int max)
-{
-    GEMMLowpOutputStageInfo info{};
-    info.gemmlowp_multiplier = result_fixedpoint_multiplier;
-    info.gemmlowp_shift      = result_shift;
-    info.gemmlowp_offset     = result_offset_after_shift;
-    info.gemmlowp_min_bound  = min;
-    info.gemmlowp_max_bound  = max;
-    info.output_data_type    = DataType::QASYMM8_SIGNED;
-    auto k                   = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
-    k->configure(compile_context, input, bias, output, &info);
-    _kernel = std::move(k);
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                    int min, int max)
-{
-    GEMMLowpOutputStageInfo info{};
-    info.gemmlowp_min_bound = min;
-    info.gemmlowp_max_bound = max;
-    info.output_data_type   = DataType::QASYMM8_SIGNED;
-    return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                                    int result_fixedpoint_multiplier, int result_shift,
-                                                                    int min, int max)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                                    int result_fixedpoint_multiplier, int result_shift,
-                                                                    int min, int max)
-{
-    GEMMLowpOutputStageInfo info{};
-    info.gemmlowp_multiplier = result_fixedpoint_multiplier;
-    info.gemmlowp_shift      = result_shift;
-    info.gemmlowp_min_bound  = min;
-    info.gemmlowp_max_bound  = max;
-    info.output_data_type    = DataType::QSYMM16;
-    auto k                   = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
-    k->configure(compile_context, input, bias, output, &info);
-    _kernel = std::move(k);
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                     int min, int max)
-{
-    GEMMLowpOutputStageInfo info{};
-    info.gemmlowp_min_bound = min;
-    info.gemmlowp_max_bound = max;
-    info.output_data_type   = DataType::QSYMM16;
-    return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
 }
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default;
+CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default;
+CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage()                                   = default;
 
 void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
 {
@@ -140,26 +52,30 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
+    _input  = input;
+    _bias   = bias;
+    _output = output;
+
     switch(info.type)
     {
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
         {
-            auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
-            k->configure(compile_context, input, bias, output, &info);
+            auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel>();
+            k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info);
             _kernel = std::move(k);
             break;
         }
         case GEMMLowpOutputStageType::QUANTIZE_DOWN:
         {
-            auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>();
-            k->configure(compile_context, input, bias, output, &info);
+            auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel>();
+            k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info);
             _kernel = std::move(k);
             break;
         }
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
         {
-            auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
-            k->configure(compile_context, input, bias, output, &info);
+            auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel>();
+            k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info);
             _kernel = std::move(k);
             break;
         }
@@ -176,13 +92,19 @@ Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorIn
     switch(info.type)
     {
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
-            return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
+            return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
         case GEMMLowpOutputStageType::QUANTIZE_DOWN:
-            return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
+            return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
-            return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info);
+            return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info);
         default:
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
     }
 }
+
+void CLGEMMLowpOutputStage::run()
+{
+    ITensorPack pack{ { ACL_SRC, _input }, { ACL_BIAS, _bias }, { ACL_DST, _output } };
+    CLScheduler::get().enqueue_op(*_kernel, pack, true);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 85d13c246e..9754bdcb82 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -30,11 +30,6 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "src/core/gpu/cl/kernels/ClTransposeKernel.h"
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index a44dcd2e24..589523a3c3 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -28,11 +28,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 #include <memory>
@@ -179,7 +174,13 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
     quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
 
     _memory_group.manage(&_output_lowp);
-    _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
+
+    GEMMLowpOutputStageInfo info{};
+    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    info.gemmlowp_multiplier = output_multiplier;
+    info.gemmlowp_shift      = output_shift;
+    info.output_data_type    = DataType::QSYMM16;
+    _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, info);
     _output_highp.allocator()->allocate();
     _bias.allocator()->allocate();
 
@@ -386,7 +387,12 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
     // _output_stage
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
+    GEMMLowpOutputStageInfo info{};
+    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    info.gemmlowp_multiplier = output_multiplier;
+    info.gemmlowp_shift      = output_shift;
+    info.output_data_type    = DataType::QSYMM16;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info));
 
     TensorInfo input_gate_input;
     TensorInfo forget_gate_input;
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index fcf5b9d2a4..5df895a91c 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -31,17 +31,14 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
 using namespace arm_compute::utils::info_helpers;
+using namespace arm_compute::opencl::kernels;
 namespace
 {
 Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
@@ -93,15 +90,15 @@ void CLQLSTMLayer::TensorCopyKernel::run()
 }
 
 CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _input_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _recurrent_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _input_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _recurrent_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _input_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _recurrent_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _input_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _recurrent_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
-      _projection_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
+    : _input_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _recurrent_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _input_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _recurrent_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _input_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _recurrent_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _input_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _recurrent_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _projection_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
       _layer_norms(),
       _copy_output()
 {
@@ -247,18 +244,22 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
         _input_to_input_weights     = lstm_params.input_to_input_weights();
         _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
 
-        _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-        _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+        _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+        _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
+                                                 -qoutput_state_in.offset, true));
     }
-    _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
+                                              -qoutput_state_in.offset, true));
+    _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
+                                            true));
+    _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
+                                              -qoutput_state_in.offset, true));
     if(_has_projection)
     {
-        _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
         if(_projection_bias != nullptr)
         {
             _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
@@ -677,19 +678,19 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
     if(!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
                                                                                true)));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
     if(lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
                                                                                lstm_params.hidden_state_zero(),
                                                                                true)));
         if(lstm_params.projection_bias() != nullptr)
@@ -1128,8 +1129,12 @@ void CLQLSTMLayer::prepare()
         {
             _input_to_input_eff_bias.allocator()->allocate();
             _recurrent_to_input_eff_bias.allocator()->allocate();
-            CLScheduler::get().enqueue(*_input_to_input_reduction);
-            CLScheduler::get().enqueue(*_recurrent_to_input_reduction);
+
+            ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } };
+            CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false);
+
+            ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } };
+            CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false);
 
             _input_to_input_weights_transposed.allocator()->allocate();
             _recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1144,17 +1149,30 @@ void CLQLSTMLayer::prepare()
         _recurrent_to_cell_eff_bias.allocator()->allocate();
         _input_to_output_eff_bias.allocator()->allocate();
         _recurrent_to_output_eff_bias.allocator()->allocate();
-        CLScheduler::get().enqueue(*_input_to_forget_reduction);
-        CLScheduler::get().enqueue(*_recurrent_to_forget_reduction);
-        CLScheduler::get().enqueue(*_input_to_cell_reduction);
-        CLScheduler::get().enqueue(*_recurrent_to_cell_reduction);
-        CLScheduler::get().enqueue(*_input_to_output_reduction);
-        CLScheduler::get().enqueue(*_recurrent_to_output_reduction);
+
+        ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } };
+        CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false);
+
+        ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } };
+        CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false);
+
+        ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } };
+        CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false);
+
+        ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } };
+        CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false);
+
+        ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } };
+        CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false);
+
+        ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } };
+        CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false);
 
         if(_has_projection)
         {
             _projection_eff_bias.allocator()->allocate();
-            CLScheduler::get().enqueue(*_projection_reduction);
+            ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } };
+            CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false);
             if(_projection_bias != nullptr)
             {
                 _projection_bias_add.run();
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 755fa40121..20deef4edf 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -29,11 +29,6 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
index 1c7446f653..52adb94c83 100644
--- a/tests/validation/CL/GEMMLowp.cpp
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -213,140 +213,6 @@ TEST_SUITE_END() // BoundedReLu
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // QuantizeDownInt32Scale
 
-TEST_SUITE(QuantizeDownInt32ScaleByFixedPoint)
-
-TEST_SUITE(QASYMM8)
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-using CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QASYMM8
-TEST_SUITE(QASYMM8_SIGNED)
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2)
-                                                                   * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 127) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2)
-                                                                        * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128, -126) * framework::dataset::make("max", 110, 112) * framework::dataset::make("addBias", { false, true });
-using CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QASYMM8_SIGNED
-TEST_SUITE(QSYMM16)
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823,
-                                                                                                        1073741825)
-                                                                               * framework::dataset::make("result_shift", -3,
-                                                                                                          -2)
-                                                                               * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600,
-                                                                                                             254601602)
-                                                                                    * framework::dataset::make("result_shift", -3,
-                                                                                                               -1)
-                                                                                    * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-using CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
-
-TEST_SUITE(NoRelu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // NoRelu
-TEST_SUITE(BoundedReLu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QSYMM16
-TEST_SUITE_END() // QuantizeDownInt32ScaleByFixedPoint
-
 TEST_SUITE(QuantizeDownInt32ScaleByFloat)
 
 TEST_SUITE(QASYMM8)
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
index 1057af95f2..d733a00296 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/framework/Asserts.h"
@@ -41,7 +41,7 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 // Create function for CLGEMMMatrixMultiplyNativeKernel
-using CLGEMMLowpMatrixMultiplyNative = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyNativeKernel>;
+using CLGEMMLowpMatrixMultiplyNative = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel>;
 
 // Fixture for CLGEMMLowpMatrixMultiplyNative
 using CLGEMMLowpMatrixMultiplyNativeFixture = GEMMLowpMatrixMultiplyNativeValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyNative>;
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
index 68a7d055ad..3baa39bffc 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h"
 #include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
 #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
@@ -49,7 +49,7 @@ using CLGEMMReshapeLHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmResha
 using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeRhsMatrixKernel>;
 
 // Create function for CLGEMMLowpMatrixMultiplyReshapedKernel
-using CLGEMMLowpMatrixMultiplyReshaped = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedKernel>;
+using CLGEMMLowpMatrixMultiplyReshaped = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedKernel>;
 
 // Fixture for CLGEMMLowpMatrixMultiplyReshaped
 using CLGEMMLowpMatrixMultiplyReshapedFixture = GEMMLowpMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshaped>;
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
index 43b86b51e8..1283713c4d 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
@@ -25,7 +25,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
 #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
@@ -49,7 +49,7 @@ using namespace arm_compute::misc::shape_calculator;
 using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeRhsMatrixKernel>;
 
 // Create function for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
-using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>;
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>;
 
 // Fixture for CLGEMMLowpMatrixMultiplyReshapedOnlyRHS
 using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFixture = GEMMLowpMatrixMultiplyReshapedOnlyRHSValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS>;
@@ -157,7 +157,7 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned
 
     // Create and configure function
     CLGEMMLowpMatrixMultiplyReshapedOnlyRHS gemm;
-    gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info);
+    gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info);
 }
 } // namespace
 
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index ab9d35de0f..5e2154592e 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -959,7 +959,7 @@ protected:
         GEMMFunctionType       gemm;
         reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
         reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
-        gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K));
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K));
 
         ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
         ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
@@ -988,7 +988,8 @@ protected:
         reshape_lhs.run(reshape_lhs_pack);
         ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
         reshape_rhs.run(reshape_rhs_pack);
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1113,7 +1114,7 @@ protected:
         GEMMFunctionType       gemm;
         reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
         reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
-        gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h));
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h));
 
         ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
         ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
@@ -1142,7 +1143,8 @@ protected:
         reshape_lhs.run(reshape_lhs_pack);
         ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
         reshape_rhs.run(reshape_rhs_pack);
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1266,7 +1268,7 @@ protected:
         ReshapeRHSOperatorType reshape_rhs;
         GEMMFunctionType       gemm;
         reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
-        gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info);
+        gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info);
 
         ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
         ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
@@ -1291,7 +1293,8 @@ protected:
         // Compute GEMM
         ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
         reshape_rhs.run(reshape_rhs_pack);
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1412,7 +1415,7 @@ protected:
         ReshapeRHSOperatorType reshape_rhs;
         GEMMFunctionType       gemm;
         reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
-        gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info);
+        gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info);
 
         ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
         ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
@@ -1437,7 +1440,8 @@ protected:
         // Compute GEMM
         ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
         reshape_rhs.run(reshape_rhs_pack);
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1527,7 +1531,7 @@ protected:
 
         // Create and configure function
         GEMMFunctionType gemm;
-        gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K));
+        gemm.configure(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K));
 
         ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
         ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
@@ -1548,7 +1552,8 @@ protected:
         fill(AccessorType(rhs), 1);
 
         // Compute GEMM
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1624,7 +1629,7 @@ protected:
 
         // Create and configure function
         GEMMFunctionType gemm;
-        gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h));
+        gemm.configure(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h));
 
         ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
         ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
@@ -1645,7 +1650,8 @@ protected:
         fill(AccessorType(rhs), 1);
 
         // Compute GEMM
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
-- 
cgit v1.2.1