From 05288a2b871ef99f544771621c3bba409b2f70df Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Tue, 21 Nov 2017 10:57:50 +0000 Subject: COMPMID-697 - Rework GEMMLowp interface on OpenCL Reworked the interface of GemmLowp in order to make easy the integration in Android NN - Added support for different output stage - Added validation for both matrix multiplication and output stage - Added bounded relu support in the output stage - Added in32_t bias support - Added optimized path for vector by matrix case This rework is required for: - Convolution quantized - Fully connected quantized Change-Id: I512283d406099cf8c614dd89d0a97ed411143afc Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110625 Reviewed-by: Georgios Pinitas Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com --- arm_compute/core/CL/CLKernels.h | 3 + .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h | 31 +- .../kernels/CLGEMMLowpOffsetContributionKernel.h | 82 ++++ ...CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 84 ++++ .../core/CL/kernels/CLGEMMLowpReductionKernel.h | 99 ++++ .../kernels/NEGEMMLowpOffsetContributionKernel.h | 4 +- ...NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 2 +- .../core/NEON/kernels/NEGEMMLowpReductionKernel.h | 2 +- arm_compute/runtime/CL/CLFunctions.h | 3 +- arm_compute/runtime/CL/functions/CLGEMMLowp.h | 89 ---- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 91 ++++ .../runtime/CL/functions/CLGEMMLowpOutputStage.h | 78 +++ docs/00_introduction.dox | 2 +- src/core/CL/CLKernelLibrary.cpp | 11 +- src/core/CL/cl_kernels/gemm.cl | 104 ---- src/core/CL/cl_kernels/gemmlowp.cl | 540 +++++++++++++++++++++ .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp | 99 ++-- .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 162 +++++++ ...GEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp | 128 +++++ src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp | 162 +++++++ .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 2 +- src/runtime/CL/functions/CLGEMMLowp.cpp | 93 ---- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 178 +++++++ src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 37 ++ .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 10 +- tests/datasets/LargeGEMMLowpDataset.h | 2 + tests/datasets/SmallGEMMLowpDataset.h | 4 +- tests/validation/CL/GEMMLowp.cpp | 172 +++++++ tests/validation/NEON/GEMMLowp.cpp | 18 +- 29 files changed, 1932 insertions(+), 360 deletions(-) create mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h delete mode 100644 arm_compute/runtime/CL/functions/CLGEMMLowp.h create mode 100644 arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h create mode 100644 arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h create mode 100644 src/core/CL/cl_kernels/gemmlowp.cl create mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp create mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp create mode 100644 src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp delete mode 100644 src/runtime/CL/functions/CLGEMMLowp.cpp create mode 100644 src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp create mode 100644 src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp create mode 100644 tests/validation/CL/GEMMLowp.cpp diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index 
8da0cecad5..e80df6aada 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -58,6 +58,9 @@ #include "arm_compute/core/CL/kernels/CLFloorKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h index 05956aeeba..b60b80618c 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h @@ -30,15 +30,15 @@ namespace arm_compute { class ICLTensor; -/** OpenCL kernel to compute low precision matrix multiplication kernel +/** OpenCL kernel to multiply matrices * + * @note @ref CLGEMMLowpMatrixMultiplyKernel low precision matrix product kernel * This kernel performs the following computation: - * -# Convert a values from uint8 to int32 and add a_offset to each of them. - * -# Convert b values from uint8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. + * + * -# Convert a values from int8 to int32 + * -# Convert b values from int8 to int32 + * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 + * */ class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel { @@ -55,19 +55,12 @@ public: CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default; /** Initialise the kernel's input and output. * - * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel. - * These two kernels change the layout of the original matrices to be more cache-friendly. - * - * @param[in] input0 Input tensor containing the interleaved Matrix A. Data types supported: U8 - * @param[in] input1 Input tensor containing the transposed Matrix B. Data types supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication, Data types supported: same as @p input0 - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Offset to be added to each element of the output matrix - * @param[in] shift Number of bits to shift right the result. + * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8 + * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0 + * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type supported: S32 + * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); + void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..5f2e025687 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ +#define __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. 
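+ * Writing each element of matrix A as (A[i][j] + a_offset) and each element of matrix B as (B[j][k] + b_offset),
+ * expanding the int32 product shows that the terms missing from the raw A * B accumulation are a_offset times
+ * the column sums of matrix B, b_offset times the row sums of matrix A, and the constant a_offset * b_offset * k.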
+ * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class CLGEMMLowpOffsetContributionKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGEMMLowpOffsetContributionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_vector_sum_col; + const ICLTensor *_vector_sum_row; + ICLTensor *_mm_result; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h new file mode 100644 index 0000000000..75f3750714 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ +#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + */ +class CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. 
Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ */ \ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h new file mode 100644 index 0000000000..aa0583fe81 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ +#define __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Common interface for all OpenCL reduction kernels */ +class ICLGEMMLowpReductionKernel : public ICLKernel +{ +public: + /** Constructor */ + ICLGEMMLowpReductionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete; + /** Allow instances of this class to be moved */ + ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default; + /** Allow instances of this class to be moved */ + ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S8 + * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 + */ + virtual void configure(const ICLTensor *input, ICLTensor *output) = 0; + +protected: + const ICLTensor *_input; + ICLTensor *_output; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8 + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + */ + void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) override; + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8 + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + */ + void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) override; + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h index 27cb3f2c1c..989260de11 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h @@ -30,9 +30,9 @@ namespace arm_compute { class ITensor; -/* NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. 
The computation is performed in-place +/** NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place * - * This kernel takes a final int32 accumulator value (the output of @NEGEMMLowpMatrixMultiplyKernel), + * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), * and adds to it the offset contribution of matrix A and matrix B in-place. * * The final result is: diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h index 7684350c0f..a522069330 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -30,7 +30,7 @@ namespace arm_compute { class ITensor; -/* NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 +/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 * * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. * The following computations will be performed by the kernel: diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h index 9ca5cdf828..50d8b4070e 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -47,7 +47,7 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data type supported: S8 + * @param[in] input Input tensor. Data type supported: QASYMM8 * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 * @param[in] k Number of matrix A columns (or matrix B rows) * @param[in] is_reshaped True if the input tensor has been reshaped diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index 360372d192..9a20769ca1 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -59,7 +59,8 @@ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h" #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h deleted file mode 100644 index ffd997f6ec..0000000000 --- a/arm_compute/runtime/CL/functions/CLGEMMLowp.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_CLGEMMLOWP_H__ -#define __ARM_COMPUTE_CLGEMMLOWP_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" -#include "arm_compute/runtime/CL/CLMemoryGroup.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" - -#include - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to execute GEMMLowp on OpenCL. This function calls the following OpenCL kernels: -* -* -# @ref CLGEMMInterleave4x4Kernel -* -# @ref CLGEMMTranspose1xWKernel -* -# @ref CLGEMMLowpMatrixMultiplyKernel -* -*/ -class CLGEMMLowp : public IFunction -{ -public: - /** Constructor */ - CLGEMMLowp(std::shared_ptr memory_manager = nullptr); - /** Initialise the kernel's inputs, output - * - * @note GEMM_LOWP: low precision matrix multiply kernel - * This kernel performs the following computation: - * - * -# Convert a values from uint8 to int32 and add a_offset to each of them. - * -# Convert b values from uint8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. - * - * @param[in] a First input tensor (Matrix A). Data types supported: U8. - * @param[in] b Second input tensor (Matrix B). Data types supported: same as @p a. - * @param[out] output Output tensor. Data types supported: same as @p a. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Multiplied with each element of the output matrix - * @param[in] shift Number of bits to shift right the result. 
- */ - void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); - - // Inherited methods overridden: - void run() override; - -private: - CLMemoryGroup _memory_group; - CLGEMMInterleave4x4Kernel _interleave_kernel; - CLGEMMTranspose1xWKernel _transpose_kernel; - CLGEMMLowpMatrixMultiplyKernel _mm_kernel; - CLTensor _tmp_a; - CLTensor _tmp_b; -}; -} -#endif /*__ARM_COMPUTE_CLGEMMLOWP_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h new file mode 100644 index 0000000000..9944afeac7 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ +#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ + +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class IMemoryManager; +class ICLTensor; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. 
This function calls the following OpenCL kernels: + * + * -# @ref CLGEMMInterleave4x4Kernel (if the output tensor is a matrix) + * -# @ref CLGEMMTranspose1xWKernel (if the output tensor is a matrix) + * -# @ref CLGEMMLowpMatrixMultiplyKernel + * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0) + * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0) + * -# @ref CLGEMMLowpOffsetContributionKernel + * +*/ +class CLGEMMLowpMatrixMultiplyCore : public IFunction +{ +public: + /** Constructor */ + CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager = nullptr); + /** Initialise the kernel's inputs, output + * + * @note GEMM_LOWP: low precision GEMM kernel + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[out] output Output tensor. Data type supported: Data type supported: S32 + */ + void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output); + + // Inherited methods overridden: + void run() override; + +private: + CLMemoryGroup _memory_group; + CLGEMMLowpMatrixMultiplyKernel _mm_kernel; + CLGEMMInterleave4x4Kernel _mtx_a_reshape_kernel; + CLGEMMTranspose1xWKernel _mtx_b_reshape_kernel; + CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + CLTensor _vector_sum_col; + CLTensor _vector_sum_row; + CLTensor _tmp_a; + CLTensor _tmp_b; + int32_t _a_offset; + int32_t _b_offset; + bool _is_interleaved_transposed; +}; +} +#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h new file mode 100644 index 0000000000..5c176a960b --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ +#define __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +/** This file contains all available output stages for GEMMLowp on OpenCL. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final ASYMM8 value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8Scale on OpenCL. + * + * CLGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift + * The final result is: + * + * ((input[i][k] + result_offset) * result_mult_int) >> result_shift + * + * In case the bias tensor is provided, the final result is: + * + * ((input[i][k] + result_offset) * result_mult_int + bias[k]) >> result_shift + * + * This function calls the following OpenCL kernels: + * + * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * + * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions + * before the result is shifted right by result_shift +*/ +class CLGEMMLowpQuantizeDownInt32ToUint8Scale : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); +}; +} +#endif /*__ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ */ \ No newline at end of file diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index b5a1d59f6a..cc12897278 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -253,7 +253,7 @@ v17.03.1 First Major public release of the sources - New CPP target introduced for C++ kernels shared between NEON and CL functions. - New padding calculation interface introduced and ported most kernels / functions to use it. 
- New OpenCL kernels / functions: - - @ref arm_compute::CLGEMMLowpMatrixMultiplyKernel / @ref arm_compute::CLGEMMLowp + - @ref arm_compute::CLGEMMLowpMatrixMultiplyKernel / arm_compute::CLGEMMLowp - New NEON kernels / functions: - @ref arm_compute::NENormalizationLayerKernel / @ref arm_compute::NENormalizationLayer - @ref arm_compute::NETransposeKernel / @ref arm_compute::NETranspose diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 6cc5a9a6b5..948fe441cf 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -218,7 +218,6 @@ const std::map CLKernelLibrary::_kernel_program_map = { "gemm_ma_qs8", "gemm.cl" }, { "gemm_ma_qs16", "gemm.cl" }, { "gemm_mv", "gemv.cl" }, - { "gemm_mm_interleaved_transposed_u8", "gemm.cl" }, { "gemm_mm_interleaved_transposed_f16", "gemm.cl" }, { "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" }, { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" }, @@ -233,6 +232,12 @@ const std::map CLKernelLibrary::_kernel_program_map = { "gemm_transpose1x16", "gemm.cl" }, { "gemm_transpose1x8", "gemm.cl" }, { "gemm_transpose1x4", "gemm.cl" }, + { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" }, + { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" }, + { "gemmlowp_mm", "gemmlowp.cl" }, + { "gemmlowp_mm_interleaved_transposed", "gemmlowp.cl" }, + { "gemmlowp_offset_contribution", "gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" }, { "harris_score_3x3", "harris_corners.cl" }, { "harris_score_5x5", "harris_corners.cl" }, { "harris_score_7x7", "harris_corners.cl" }, @@ -480,6 +485,10 @@ const std::map CLKernelLibrary::_program_source_map = { "gemm.cl", #include "./cl_kernels/gemm.clembed" + }, + { + "gemmlowp.cl", +#include "./cl_kernels/gemmlowp.clembed" }, { "gemv.cl", diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 15111ed352..c763cb355b 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -251,110 +251,6 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), } #if defined(COLS_B) -/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) - * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication - * - * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B - * - * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. 
Supported formats: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] a_offset Offset to be added to each element of the matrix A - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] c_offset Offset to be added to each element of the matrix C. - * @param[in] c_mult_int Multiplied with each element of the matrix C. - * @param[in] shift Number of bits to shift right the result. - */ -__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), - IMAGE_DECLARATION(dst), - int a_offset, - int b_offset, - int c_offset, - int c_mult_int, - int shift) -{ - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B - - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); - - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; - - // Reset accumulators - int16 c00 = 0.0f; - int16 c10 = 0.0f; - int16 c20 = 0.0f; - int16 c30 = 0.0f; - - for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32)) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0)); - int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); - - c00 += (int16)a0.s0 * b0; - c10 += (int16)a0.s1 * b0; - c20 += (int16)a0.s2 * b0; - c30 += (int16)a0.s3 * b0; - - int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16)); - - c00 += (int16)a0.s4 * b1; - c10 += (int16)a0.s5 * b1; - c20 += (int16)a0.s6 * b1; - c30 += (int16)a0.s7 * b1; - } - - for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16)) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0)); - int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); - - c00 += (int16)a0.s0 * b0; - c10 += (int16)a0.s1 * b0; - c20 += (int16)a0.s2 * b0; - c30 += (int16)a0.s3 * b0; - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Multiply by the weight of matrix product - c00 = (((int16)c_offset + c00) * 
(int16)c_mult_int) >> shift; - c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift; - c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift; - c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift; - - // Store 4x16 block - vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0))); - vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1))); - vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2))); - vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3))); -} - /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication * diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl new file mode 100644 index 0000000000..7cd0c0b8db --- /dev/null +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(COLS_B) +/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) + * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication + * + * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. 
Supported data type: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +__kernel void gemmlowp_mm_interleaved_transposed(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), + IMAGE_DECLARATION(dst)) +{ + // src_addr.s0 = address of matrix A + // src_addr.s1 = address of matrix B + // Compute address for matrix A and B + int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), + (src1_stride_y)); + + // Add offset_first_element_in_bytes + src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Compute end row address for matrix B + int end_row_mtx_b = src_addr.s1 + COLS_B; + + // Reset accumulators + int16 c00 = 0; + int16 c10 = 0; + int16 c20 = 0; + int16 c30 = 0; + + for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32)) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + int8 a0 = convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0)); + int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); + + c00 += (int16)a0.s0 * b0; + c10 += (int16)a0.s1 * b0; + c20 += (int16)a0.s2 * b0; + c30 += (int16)a0.s3 * b0; + + int16 b1 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16)); + + c00 += (int16)a0.s4 * b1; + c10 += (int16)a0.s5 * b1; + c20 += (int16)a0.s6 * b1; + c30 += (int16)a0.s7 * b1; + } + + for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16)) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + int4 a0 = convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0)); + int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); + + c00 += (int16)a0.s0 * b0; + c10 += (int16)a0.s1 * b0; + c20 += (int16)a0.s2 * b0; + c30 += (int16)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Store 4x16 block + vstore16(c00, 0, (__global int *)(offset(&dst, 0, 0))); + vstore16(c10, 0, (__global int *)(offset(&dst, 0, 1))); + vstore16(c20, 0, (__global int *)(offset(&dst, 0, 2))); + vstore16(c30, 0, (__global int *)(offset(&dst, 0, 3))); +} +#endif // defined(COLS_B) + +#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A) +#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X) +#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X) +#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X) +/** This 
OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +__kernel void gemmlowp_mm(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), + IMAGE_DECLARATION(dst)) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx; + + int end_row_vec_a = src_addr.s0 + COLS_A; + + VECTOR_UINT acc0 = 0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VECTOR_UINT acc1 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VECTOR_UINT acc2 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VECTOR_UINT acc3 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y)) + { + // Load values from matrix A + uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + // Load 
values from matrix B + VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1); + VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y); + + // Accumulate + acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0; + acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0; + acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0; + acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0; + acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y)) + { + // Load values from matrix A + uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + // Load values from matrix B + VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1); + + // Accumulate + acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Store the result + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0))); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} +#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A) + +#if defined(COLS_A) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. 
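+ * Each work-item accumulates one row of Matrix A and writes a single S32 element of the output vector,
+ * which is consumed as vector_sum_row by @ref CLGEMMLowpOffsetContributionKernel and scaled there by b_offset.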
+ * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uint4 sum_row_u32 = (uint4)0; + uint sum_row = 0; + + __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 16 accumulations + for(; i <= ((int)COLS_A - 16); i += 16) + { + const uchar16 a0_u8 = vload16(0, matrix_a + i); + + sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF); + } + + // This for loop performs the leftover accumulations + for(; i < COLS_A; ++i) + { + sum_row += matrix_a[i]; + } + + sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3; + + *((__global int *)dst.ptr) = (int)sum_row; +} +#endif // defined(COLS_A) + +#if defined(COLS_B) && defined(ROWS_B) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data type: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uint16 sum_col_u32 = (uint16)0; + + __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z); + + int i = 0; + // This for loop performs 4 accumulations + for(; i <= ((int)ROWS_B - 4); i += 4) + { + const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y); + const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y); + const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y); + const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y); + + sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8); + + matrix_b += 4 * src_stride_y; + } + + // This for loop performs the leftover accumulations + for(; i < (int)ROWS_B; ++i) + { + const uchar16 b0_u8 = vload16(0, matrix_b); + + sum_col_u32 += convert_uint16(b0_u8); + + matrix_b += src_stride_y; + } + + vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr); +} +#endif // defined(COLS_B) && defined(ROWS_B) + +#if defined(K_OFFSET) +/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * @param[in] mm_result_ptr Pointer to the source tensor.
Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_col_result_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_result_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_col_result_step_x sum_col_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_col_result_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_col_result_step_y sum_col_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_row_result_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_result_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_row_result_step_x sum_row_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_row_result_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_row_result_step_y sum_row_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor + */ +__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + ) +{ + Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result); + + int16 a_offset_s32 = (int16)0; + int16 b_offset_s32 = (int16)0; + +#if defined(A_OFFSET) + Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col); + + // Compute the offset contribution due to A_OFFSET + a_offset_s32 = vload16(0, (__global int *)sum_col.ptr + get_global_id(2) * sum_col_stride_y); + a_offset_s32 *= (int16)A_OFFSET; +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row); + + // Compute the offset contribution due to B_OFFSET + b_offset_s32 = (int16) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1)); + b_offset_s32 *= (int16)B_OFFSET; +#endif // defined(B_OFFSET) + + const int16 offset_term_s32 = (int16)K_OFFSET + a_offset_s32 + b_offset_s32; + + int16 in_s32 = vload16(0, (__global int *)mm_result.ptr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // Store the result with the offset contribution + vstore16(in_s32, 0, (__global int *)mm_result.ptr); +} +#endif // defined(K_OFFSET) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + 
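Taken together, the offset contribution applied above and the quantize-down stage documented here amount, per output element, to the following scalar sketch (function name is hypothetical; bias is only added when -DADD_BIAS is defined, and the clamps mirror -DMIN_BOUND/-DMAX_BOUND, assuming 0 <= min_bound <= max_bound <= 255):

#include <algorithm>
#include <cstdint>

// Scalar sketch of one output element: offset contribution, then quantize-down.
inline uint8_t quantize_down_element(int32_t mm_result, int32_t sum_col, int32_t sum_row,
                                     int32_t a_offset, int32_t b_offset, int32_t k,
                                     int32_t result_offset, int32_t result_mult_int,
                                     int32_t result_shift, int32_t bias,
                                     int32_t min_bound, int32_t max_bound)
{
    // Offset contribution: mm_result + sum_col * A_OFFSET + sum_row * B_OFFSET + K_OFFSET
    int32_t acc = mm_result + sum_col * a_offset + sum_row * b_offset + a_offset * b_offset * k;

    // Quantize down: add the result offset, scale, add the optional bias, then shift right
    acc = (acc + result_offset) * result_mult_int + bias;
    acc >>= result_shift;

    // Saturate to [0..255], then apply the optional bounded ReLU clamp
    acc = std::clamp<int32_t>(acc, 0, 255);
    acc = std::clamp(acc, min_bound, max_bound);
    return static_cast<uint8_t>(acc);
}

Passing bias = 0, min_bound = 0 and max_bound = 255 reproduces the path taken when -DADD_BIAS, -DMIN_BOUND and -DMAX_BOUND are not defined.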
* + * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time) + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time + * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. + * These values can be used to implement "rectified linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] biases_ptr Pointer to the biases tensor.
Supported data type: same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); +#if defined(ADD_BIAS) + Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); +#endif // defined(ADD_BIAS) + + int16 input_values = vload16(0, (__global int *)src.ptr); + + // Add the offset terms to GEMM's result + input_values += (int16)RESULT_OFFSET; + + // Multiply by result_mult_int + input_values *= (int16)RESULT_MULT_INT; + +#if defined(ADD_BIAS) + // Add bias + const int16 biases_values = vload16(0, (__global int *)biases.ptr); + input_values += (int16)biases_values; +#endif // defined(ADD_BIAS) + + // Shift final result + input_values >>= RESULT_SHIFT; + + // Saturate negative values + input_values = max(input_values, (int16)0); + + uchar16 res = convert_uchar16_sat(input_values); + +#if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore16(res, 0, dst.ptr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp index ef572cfc7e..b3227c0db9 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp @@ -51,45 +51,88 @@ CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel() { } -void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, - int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) +void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + if(!is_interleaved_transposed) + { + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + } + + TensorShape in1_shape = input1->info()->tensor_shape(); + in1_shape.collapse(2); _input0 = input0; _input1 = input1; _output = output; - // Create kernel and set static arguments - std::set build_opts = { ("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))) }; - _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_u8", build_opts)); - unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, a_offset); - _kernel.setArg(idx++, b_offset); - _kernel.setArg(idx++, output_offset); - _kernel.setArg(idx++, output_mult_int); - _kernel.setArg(idx++, shift); + CLBuildOptions build_opts; - // Configure window - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - constexpr unsigned int num_elems_read_per_iteration_input0 = 4; - constexpr unsigned int num_elems_read_per_iteration_input1 = 16; + if(is_interleaved_transposed) + { + // Create kernel and set static arguments + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_mm_interleaved_transposed", build_opts.options())); - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + // Configure window + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + constexpr unsigned int num_elems_read_per_iteration_input0 = 4; + constexpr unsigned int num_elems_read_per_iteration_input1 = 16; - AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1); - AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - update_window_and_padding(win, input0_access, input1_access, output_access); + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1); + AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); + } + else + { + // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. 
num_elems_processed_per_iteration_x + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); + + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x)); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y)); + + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_mm", build_opts.options())); + + // Configure window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y)); + AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1)); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure(win); + } - ICLKernel::configure(win); + // Set config_id for enabling LWS tuning + _config_id = "gemmlowp_"; + _config_id += (is_interleaved_transposed ? "reshaped_" : ""); + _config_id += lower_string(string_from_data_type(input0->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) @@ -117,7 +160,7 @@ void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument(idx, _input0, slice); add_2D_tensor_argument(idx, _input1, slice_b); add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); + enqueue(queue, *this, slice, _lws_hint); } while(window.slide_window_slice_2D(slice)); } diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp new file mode 100644 index 0000000000..96919fe3cb --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr) +{ +} + +void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); + + TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + vector_sum_col_shape.collapse(1); + + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); + + TensorShape output_shape = mm_result->info()->tensor_shape(); + TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); + vector_sum_row_shape.collapse(1); + output_shape.collapse(2); + + ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + vector_sum_col_shape.collapse(1); + + ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 + && vector_sum_col_shape[1] 
!= vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + + build_opts.add_option("-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + } + + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options())); + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, mm_result_access); + + if(a_offset != 0) + { + AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, vector_sum_col_access); + } + + if(b_offset != 0) + { + AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); + update_window_and_padding(win, vector_sum_row_access); + } + + ICLKernel::configure(win); +} + +void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _mm_result, slice); + if(_vector_sum_col != nullptr) + { + add_2D_tensor_argument(idx, _vector_sum_col, win_vector_sum_col); + } + if(_vector_sum_row != nullptr) + { + add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row); + } + enqueue(queue, *this, slice); + } + while(collapsed.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp new file mode 100644 index 0000000000..fa6a48e77c --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel() + : _input(nullptr), _bias(nullptr), _output(nullptr) +{ +} + +void CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, + int max) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON(max > 255); + ARM_COMPUTE_ERROR_ON(min < 0 || min > max); + + if(bias != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0)); + } + + _input = input; + _bias = bias; + _output = output; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(result_offset)); + build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(result_mult_int)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift)); + build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down", build_opts.options())); + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + output_result_access); + + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]); + + update_window_and_padding(win, + bias_access); + } + + output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void 
CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(_bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, _bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx1, _output, slice); + enqueue(queue, *this, slice); + } + while(collapsed.slide_window_slice_3D(slice)); +} \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp new file mode 100644 index 0000000000..6f410d3b14 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() + : _input(), _output() +{ +} + +void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + _input = mtx_a; + _output = vector_sum_row; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0))); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options())); + + const unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); + AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); + Window slice_in = collapsed.first_slice_window_2D(); + Window slice_out = collapsed.first_slice_window_2D(); + + // Setup input slice. Its dimensions are increased in the cl kernel. 
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(collapsed.slide_window_slice_2D(slice_out)); +} + +void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + + _input = mtx_b; + _output = vector_sum_col; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0))); + build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1))); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_b_reduction", build_opts.options())); + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); + AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); + + Window slice_out = collapsed.first_slice_window_2D(); + Window slice_in = slice_out; + + slice_in.set(Window::DimY, Window::Dimension(0, 1, 1)); + slice_in.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(collapsed.slide_window_slice_2D(slice_out)); +} diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index a8395a15cb..81094f8743 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -209,7 +209,7 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf uint32x4_t sum_row_u32 = vdupq_n_u32(0); uint32_t sum_row = 0; - const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp deleted file mode 100644 index db6d11c2c3..0000000000 --- a/src/runtime/CL/functions/CLGEMMLowp.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLGEMMLowp::CLGEMMLowp(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b() -{ -} - -void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); - ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C"); - - // Create shape for interleaved temporary tensor - TensorShape shape_tmp_a = a->info()->tensor_shape(); - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, ceil(a->info()->dimension(1) / 4)); - TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); - _tmp_a.allocator()->init(info_a); - - // Create shape for tranposed temporary tensor - TensorShape shape_tmp_b = b->info()->tensor_shape(); - shape_tmp_b.set(0, b->info()->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(static_cast(b->info()->dimension(0)) / 16)); - TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); - _tmp_b.allocator()->init(info_b); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); - - // Configure kernels - _interleave_kernel.configure(a, &_tmp_a); - _transpose_kernel.configure(b, &_tmp_b); - _mm_kernel.configure(&_tmp_a, &_tmp_b, output, 
a_offset, b_offset, output_offset, output_mult_int, shift); - - // Allocate intermediate buffers - _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); -} - -void CLGEMMLowp::run() -{ - _memory_group.acquire(); - - /* Run interleave kernel */ - CLScheduler::get().enqueue(_interleave_kernel, false); - - /* Run transpose kernel */ - CLScheduler::get().enqueue(_transpose_kernel, false); - - /* Run matrix multiply kernel */ - CLScheduler::get().enqueue(_mm_kernel, false); - - _memory_group.release(); -} diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp new file mode 100644 index 0000000000..5d2d13e243 --- /dev/null +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), + _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true) +{ +} + +void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); + + _a_offset = a->info()->quantization_info().offset; + _b_offset = b->info()->quantization_info().offset; + + // If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors + _is_interleaved_transposed = a->info()->dimension(1) > 16; + + const ICLTensor *matrix_a = a; + const ICLTensor *matrix_b = b; + + if(_is_interleaved_transposed) + { + matrix_a = &_tmp_a; + matrix_b = &_tmp_b; + + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + TensorShape shape_tmp_a = a->info()->tensor_shape(); + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + TensorShape shape_tmp_b = b->info()->tensor_shape(); + shape_tmp_b.set(0, b->info()->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); + + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); + _tmp_a.allocator()->init(info_a); + _tmp_b.allocator()->init(info_b); + _memory_group.manage(&_tmp_a); + _memory_group.manage(&_tmp_b); + + // Configure interleave kernel + _mtx_a_reshape_kernel.configure(a, &_tmp_a); + + // Configure transpose kernel + _mtx_b_reshape_kernel.configure(b, &_tmp_b); + } + + // Configure matrix multiply kernel + _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed); + + // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + TensorShape shape_vector_sum_col = b->info()->tensor_shape(); + if(b->info()->num_dimensions() > 1) + { + shape_vector_sum_col.remove_dimension(1); + } + TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32); + 
_vector_sum_col.allocator()->init(info_vector_sum_col); + _memory_group.manage(&_vector_sum_col); + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel.configure(b, &_vector_sum_col); + } + + // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + TensorShape shape_vector_sum_row = a->info()->tensor_shape(); + shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1)); + if(a->info()->num_dimensions() > 1) + { + shape_vector_sum_row.remove_dimension(1); + } + TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32); + _vector_sum_row.allocator()->init(info_vector_sum_row); + _memory_group.manage(&_vector_sum_row); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel.configure(a, &_vector_sum_row); + } + + // Configure offset contribution kernel + _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset); + + // Allocate tensors + if(_is_interleaved_transposed) + { + _tmp_a.allocator()->allocate(); + _tmp_b.allocator()->allocate(); + } + + if(_a_offset != 0) + { + _vector_sum_col.allocator()->allocate(); + } + + if(_b_offset != 0) + { + _vector_sum_row.allocator()->allocate(); + } +} + +void CLGEMMLowpMatrixMultiplyCore::run() +{ + _memory_group.acquire(); + + if(_is_interleaved_transposed) + { + // Run reshape matrix A + CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false); + + // Run reshape matrix B + CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); + } + + // Run matrix multiply + CLScheduler::get().enqueue(_mm_kernel, false); + + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); + } + + // Run offset contribution kernel + CLScheduler::get().enqueue(_offset_contribution_kernel, true); + + _memory_group.release(); +} diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp new file mode 100644 index 0000000000..b1d620d8a2 --- /dev/null +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max); + _kernel = std::move(k); +} \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 92c911c370..da5ac22fdc 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -133,7 +133,10 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(_a_offset != 0) { TensorShape shape_vector_sum_col = b->info()->tensor_shape(); - shape_vector_sum_col.remove_dimension(1); + if(b->info()->num_dimensions() > 1) + { + shape_vector_sum_col.remove_dimension(1); + } TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32); _vector_sum_col.allocator()->init(info_vector_sum_col); _memory_group.manage(&_vector_sum_col); @@ -147,7 +150,10 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, { TensorShape shape_vector_sum_row = a->info()->tensor_shape(); shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1)); - shape_vector_sum_row.remove_dimension(1); + if(a->info()->num_dimensions() > 1) + { + shape_vector_sum_row.remove_dimension(1); + } TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32); _vector_sum_row.allocator()->init(info_vector_sum_row); _memory_group.manage(&_vector_sum_row); diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h index cc1feb49a2..87f879e70a 100644 --- a/tests/datasets/LargeGEMMLowpDataset.h +++ b/tests/datasets/LargeGEMMLowpDataset.h @@ -42,7 +42,9 @@ class LargeGEMMLowpDataset final : public GEMMLowpDataset public: LargeGEMMLowpDataset() { + add_config(TensorShape(923U, 2U), TensorShape(871U, 923U), TensorShape(871U, 2U), 0, 0); add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0); + add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), -1, 3); add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4); add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0); add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13); diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h index 881546e70f..1d4ab53be5 100644 --- a/tests/datasets/SmallGEMMLowpDataset.h +++ b/tests/datasets/SmallGEMMLowpDataset.h @@ -42,11 +42,13 @@ class SmallGEMMLowpDataset final : public GEMMLowpDataset public: SmallGEMMLowpDataset() { + add_config(TensorShape(21U, 2U), TensorShape(43U, 21U), 
TensorShape(43U, 2U), 0, 0); add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0); + add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13); add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4); add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0); add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13); - add_config(TensorShape(38U, 12U), TensorShape(21U, 38U), TensorShape(21U, 12U), -3, -2); + add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2); add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1); } }; diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp new file mode 100644 index 0000000000..1968efcedc --- /dev/null +++ b/tests/validation/CL/GEMMLowp.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "tests/CL/CLAccessor.h" +#include "tests/PaddingCalculator.h" +#include "tests/datasets/LargeGEMMLowpDataset.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/SmallGEMMLowpDataset.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/GEMMLowpFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(CL) +TEST_SUITE(GEMMLowp) + +TEST_SUITE(MatrixMultiplyCore) +using CLGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), + shape_a, shape_b, shape_c, a_offset, b_offset) +{ + // Create tensors + CLTensor a = create_tensor(shape_a, DataType::QASYMM8); + CLTensor b = create_tensor(shape_b, DataType::QASYMM8); + CLTensor c = create_tensor(shape_c, DataType::S32); + + a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); + b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); + + ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + CLGEMMLowpMatrixMultiplyCore gemmlowp_mm; + gemmlowp_mm.configure(&a, &b, &c); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset()) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset()) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +TEST_SUITE_END() // MatrixMultiplyCore + +TEST_SUITE(OutputStage) +TEST_SUITE(QuantizeDownInt32ToUint8Scale) + +const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, + 3) + * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true }); + +const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, + 2) + * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 173) * framework::dataset::make("addBias", { false, true }); + +using CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture; + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), quantize_down_int32_to_uint8_scale_cases), + shape, result_offset, result_mult_int, result_shift, min, max, add_bias) +{ + TensorShape shape_bias(shape[0]); + + // Create tensors + CLTensor in = 
create_tensor(shape, DataType::S32); + CLTensor bias = create_tensor(shape_bias, DataType::S32); + CLTensor out = create_tensor(shape, DataType::QASYMM8); + + ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + CLGEMMLowpQuantizeDownInt32ToUint8Scale output_stage; + output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_offset, result_mult_int, result_shift, min, max); + + // Validate valid region input and output + const ValidRegion valid_region = shape_to_valid_region(shape); + validate(in.info()->valid_region(), valid_region); + validate(out.info()->valid_region(), valid_region); + + // Validate valid region bias + if(add_bias) + { + const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias); + validate(bias.info()->valid_region(), valid_region_bias); + } + + // Validate padding + const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); + validate(in.info()->padding(), padding); + validate(out.info()->padding(), padding); + + if(add_bias) + { + validate(bias.info()->padding(), padding); + } +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases)) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_cases)) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +TEST_SUITE(BoundedReLu) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // BoundedReLu + +TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale +TEST_SUITE_END() // OutputStage +TEST_SUITE_END() // GEMMLowp +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 6366223820..6d13fdc939 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -177,11 +177,11 @@ TEST_SUITE(OutputStage) TEST_SUITE(QuantizeDownInt32ToUint8Scale) -const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, +const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true }); -const auto quantize_down_int32_to_uint8_scale_relu_cases = 
framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, +const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true }); @@ -254,20 +254,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, } TEST_SUITE_END() // BoundedReLu -TEST_SUITE(AddBias) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) -{ - // Validate output - validate(Accessor(_target), _reference); -} - -FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) -{ - // Validate output - validate(Accessor(_target), _reference); -} -TEST_SUITE_END() // AddBias - TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale TEST_SUITE_END() // OutputStage -- cgit v1.2.1
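
The NEGEMMLowpMatrixMultiplyCore changes in this patch only allocate `_vector_sum_col` / `_vector_sum_row` when the corresponding offset is non-zero, and now guard `remove_dimension(1)` so it is skipped for shapes that have already collapsed to a single dimension (the degenerate vector-by-matrix style cases the new small-row dataset entries add). The sketch below is a self-contained reference, not library code: the function name, row-major layout and parameter list are illustrative. It shows the int32 result the matrix-multiply core plus the offset contribution are expected to produce, which is exactly why the row/column reductions are needed.

```cpp
// Reference sketch (illustrative, not the library kernels): int32 result of a
// low-precision GEMM where a_offset / b_offset are added to every element of
// A and B before the product, expanded into core product + offset contribution.
#include <cstdint>
#include <vector>

// A is M x K, B is K x N, both row-major uint8 quantized values.
std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a,
                                        const std::vector<uint8_t> &b,
                                        int m, int n, int k,
                                        int32_t a_offset, int32_t b_offset)
{
    // Row sums of A (length M) and column sums of B (length N): the quantities
    // the reduction kernels produce as vector_sum_row / vector_sum_col.
    std::vector<int32_t> sum_row(m, 0), sum_col(n, 0);
    for(int i = 0; i < m; ++i)
        for(int j = 0; j < k; ++j)
            sum_row[i] += a[i * k + j];
    for(int i = 0; i < k; ++i)
        for(int j = 0; j < n; ++j)
            sum_col[j] += b[i * n + j];

    std::vector<int32_t> out(m * n, 0);
    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            int32_t acc = 0;
            for(int x = 0; x < k; ++x)
                acc += int32_t(a[i * k + x]) * int32_t(b[x * n + j]); // core uint8*uint8 -> int32 product
            // Offset contribution: a_offset * (column sums of B) + b_offset * (row sums of A)
            // + K * a_offset * b_offset. When an offset is zero, its reduction term vanishes,
            // which is why the runtime only creates the matching sum vector when offset != 0.
            acc += a_offset * sum_col[j] + b_offset * sum_row[i] + k * a_offset * b_offset;
            out[i * n + j] = acc;
        }
    }
    return out;
}
```

Expanding the product this way keeps the inner loop in pure uint8 arithmetic and folds all offset handling into two cheap reductions plus a constant, which is what makes the separate offset-contribution kernel worthwhile.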
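The new `CLGEMMLowpQuantizeDownInt32ToUint8Scale` function and the `QuantizeDownInt32ToUint8Scale` test suites exercise the quantize-down output stage. A minimal per-element sketch follows, assuming the documented pipeline (add `result_offset` and the optional bias, multiply by `result_mult_int`, arithmetic shift right by `result_shift`, optional bounded-ReLU clamp, then saturate to [0, 255]); the helper name is illustrative, the bias is passed as 0 when absent, and `min == max` is treated as "no bound", as in the test cases that pass min = max = 0.

```cpp
// Reference sketch (illustrative, not the OpenCL kernel): per-element behaviour
// expected from the int32 -> uint8 quantize-down output stage.
#include <algorithm>
#include <cstdint>

uint8_t quantize_down_int32_to_uint8_scale(int32_t acc, int32_t bias,
                                           int32_t result_offset, int32_t result_mult_int,
                                           int32_t result_shift, int32_t min, int32_t max)
{
    int32_t v = (acc + bias + result_offset) * result_mult_int; // offset, optional bias, integer scale
    v >>= result_shift;                                         // fixed-point down-shift
    if(min != max)                                              // assumed: min == max means no bounded ReLU
    {
        v = std::max<int32_t>(min, std::min<int32_t>(max, v));
    }
    v = std::max<int32_t>(0, std::min<int32_t>(255, v));        // saturate to the uint8 range
    return static_cast<uint8_t>(v);
}
```

This mirrors the shape of the BoundedReLu test cases above: the relu variants pass non-trivial `min`/`max` bounds, while the plain scale cases rely only on the final saturation to [0, 255].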