From e75a02b60736f37c34388c23c0ccee230f65da59 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Wed, 8 Nov 2017 12:24:09 +0000 Subject: COMPMID-675 - Reworked NEGEMMLowp interface/function The new interface makes NEGEMMLowp able to work with ASYMM8 data types. Implemented 2 new functions: - NEGEMMLowpMatrixMultiplyCore - NEGEMMLowpOutputStage These functions should make the integration in android NN doable For more information about GEMMLowp: https://github.com/google/gemmlowp/blob/master/doc/low-precision.md Change-Id: Ie2c775f45234f68ca53dba644b3a912b997fd890 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95504 Tested-by: Kaizen Reviewed-by: Pablo Tello --- arm_compute/core/NEON/NEKernels.h | 3 +- .../core/NEON/kernels/NEGEMMLowpFinalizeKernel.h | 103 ----- .../NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 4 +- .../kernels/NEGEMMLowpOffsetContributionKernel.h | 79 ++++ ...NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 78 ++++ .../core/NEON/kernels/NEGEMMLowpReductionKernel.h | 27 +- arm_compute/runtime/NEON/NEFunctions.h | 2 +- arm_compute/runtime/NEON/functions/NEGEMMLowp.h | 94 ---- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 33 +- .../runtime/NEON/functions/NEGEMMLowpOutputStage.h | 69 +++ docs/00_introduction.dox | 2 +- .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 3 +- src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp | 509 --------------------- .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 128 +++--- .../kernels/NEGEMMLowpOffsetContributionKernel.cpp | 338 ++++++++++++++ ...GEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp | 141 ++++++ .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 176 +++---- src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 3 +- src/runtime/NEON/functions/NEGEMMLowp.cpp | 134 ------ .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 84 +++- .../NEON/functions/NEGEMMLowpOutputStage.cpp | 37 ++ tests/benchmark/NEON/GEMMLowp.cpp | 1 - tests/datasets/GEMMLowpDataset.h | 36 +- tests/datasets/LargeGEMMLowpDataset.h | 12 +- tests/datasets/SmallGEMMLowpDataset.h | 12 +- tests/validation/CPP/GEMMLowp.cpp | 67 +-- tests/validation/CPP/GEMMLowp.h | 6 +- tests/validation/NEON/GEMMLowp.cpp | 93 +++- tests/validation/fixtures/GEMMLowpFixture.h | 94 ++-- 29 files changed, 1171 insertions(+), 1197 deletions(-) delete mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h delete mode 100644 arm_compute/runtime/NEON/functions/NEGEMMLowp.h create mode 100644 arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h delete mode 100644 src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp delete mode 100644 src/runtime/NEON/functions/NEGEMMLowp.cpp create mode 100644 src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 8dedf38b3e..d78cec2a62 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -61,8 +61,9 @@ #include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h" -#include 
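As a rough usage sketch of the reworked interface described in the commit message (illustrative only, not part of this patch: the tensor shapes, offset and scale values below are invented, the helper function name is hypothetical, and how the a/b offsets are attached to the QASYMM8 input tensors is not shown in these hunks):

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_gemmlowp_example() // hypothetical helper, for illustration
    {
        Tensor a, b, mm_result, output;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8)); // A: M=32, K=64
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::QASYMM8)); // B: K=64, N=16
        mm_result.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::S32));
        output.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8));

        NEGEMMLowpMatrixMultiplyCore            mm_core;
        NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
        mm_core.configure(&a, &b, &mm_result);   // QASYMM8 x QASYMM8 -> S32 accumulators
        output_stage.configure(&mm_result, &output, /* result_offset */ -50, /* result_mult_int */ 2, /* result_shift */ 8);

        // ... allocate the four tensors and fill a and b with QASYMM8 data ...
        mm_core.run();      // matrix product plus offset contributions, in S32
        output_stage.run(); // quantize the S32 accumulators down to QASYMM8
    }

The point of the split is that the S32 accumulators stay visible between the two stages, which is what the Android NN integration needs.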
"arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h deleted file mode 100644 index 8908fabc1e..0000000000 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWPFINALIZEKERNEL_H__ -#define __ARM_COMPUTE_NEGEMMLOWPFINALIZEKERNEL_H__ - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/* NEON kernel used to finalize the GEMMLowp result - * - * This kernel performs the following computations: - * - * -# Add offset terms to final result - * -# Multiply each entry of result and round to nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. - */ -class NEGEMMLowpFinalizeKernel : public INEKernel -{ -public: - /** Constructor */ - NEGEMMLowpFinalizeKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpFinalizeKernel(const NEGEMMLowpFinalizeKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEGEMMLowpFinalizeKernel &operator=(const NEGEMMLowpFinalizeKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMLowpFinalizeKernel(NEGEMMLowpFinalizeKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMLowpFinalizeKernel &operator=(NEGEMMLowpFinalizeKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @note The input row-vectors @p vector_sum_col and @p vector_sum_row must be the output of @ref NEGEMMLowpMatrixBReductionKernel and @ref NEGEMMLowpMatrixAReductionKernel kernels. 
- * These 2 vectors are needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - * - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of input1. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of input0. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p vector_sum_col - * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: same as @p vector_sum_col - * @param[out] output Output tensor containing the result of GEMMLowP. Data type supported: S8 - * @param[in] num_mtx_a_cols Number of matrix A columns - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] c_offset Offset to be added to each element of the output matrix - * @param[in] c_mult_int Value to be multiplied to each entry of the result. - * @param[in] shift Number of bits to shift right the result. - */ - void configure(const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *mm_result, ITensor *output, int32_t num_mtx_a_cols, int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t c_mult_int, int32_t shift); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - /** Template function to run the finalize kernel - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void finalize(const Window &window); - using FinalizeFunctionPtr = void (NEGEMMLowpFinalizeKernel::*)(const Window &window); - - FinalizeFunctionPtr _func; - const ITensor *_vector_sum_col; - const ITensor *_vector_sum_row; - const ITensor *_mm_result; - ITensor *_output; - int32_t _a_offset; - int32_t _b_offset; - int32_t _c_offset; - int32_t _k_offset; - int32_t _c_mult_int; - int32_t _shift; - bool _slide_vector_sum_col; -}; -} // namespace arm_compute - -#endif /* __ARM_COMPUTE_NEGEMMLOWPFINALIZEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h index f145eb6ca3..e9bfe4ea07 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h @@ -58,8 +58,8 @@ public: * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two * kernels change the layout of the original matrices to be more cache-friendly. * - * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: S8 - * @param[in] input1 Input tensor containing the transposed Matrix B. Data type supported: same as @p input0 + * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: ASYMM8 + * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0 * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type supported: S32 */ void configure(const ITensor *input0, const ITensor *input1, ITensor *output); diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..04b84339b0 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/* NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @NEGEMMLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + */ +class NEGEMMLowpOffsetContributionKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMLowpOffsetContributionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. 
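For reference, a scalar sketch of the in-place update this kernel performs (inferred from the kernel description above and from the gemmlowp low-precision notes; the function name and the raw-pointer layout are assumptions, not code from this patch):

    #include <cstdint>

    // Hypothetical scalar equivalent of NEGEMMLowpOffsetContributionKernel (illustration only).
    // mm_result: M x N S32 accumulators (row-major); vector_sum_col: N entries; vector_sum_row: M entries;
    // k: number of matrix A columns (= matrix B rows).
    void offset_contribution_ref(int32_t *mm_result, const int32_t *vector_sum_col, const int32_t *vector_sum_row,
                                 int M, int N, int k, int32_t a_offset, int32_t b_offset)
    {
        for(int i = 0; i < M; ++i)
        {
            for(int j = 0; j < N; ++j)
            {
                mm_result[i * N + j] += a_offset * vector_sum_col[j]   // skipped when a_offset == 0
                                        + b_offset * vector_sum_row[i] // skipped when b_offset == 0
                                        + a_offset * b_offset * k;     // constant term (the _k_offset member)
            }
        }
    }
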
Data type supported: same as @p mm_result + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_vector_sum_col; + const ITensor *_vector_sum_row; + ITensor *_mm_result; + int32_t _a_offset; + int32_t _b_offset; + int32_t _k_offset; + bool _slide_vector_sum_col; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h new file mode 100644 index 0000000000..65f1042b9c --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ +#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/* NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result and round to nearest integer + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. 
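A scalar sketch of the three steps listed above (illustration only, not part of this patch; acc, result_offset, result_mult_int and result_shift stand for one S32 accumulator and the three configure() parameters, and the rounding term follows the formula documented in NEGEMMLowpOutputStage.h further down):

    const int32_t rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    int32_t tmp = (acc + result_offset) * result_mult_int;   // add offset, then scale
    tmp = (tmp + rounding) >> result_shift;                   // round to nearest and shift right
    tmp = tmp < 0 ? 0 : (tmp > 255 ? 255 : tmp);              // clamp to [0..255]
    const uint8_t out = static_cast<uint8_t>(tmp);            // cast to QASYMM8
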
+ * + */ +class NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + */ + void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + int32_t _result_offset; + int32_t _result_mult_int; + int32_t _result_shift; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h index a069969681..6eee54a9f0 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -45,10 +45,9 @@ public: /** Allow instances of this class to be moved */ INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default; -public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor containing the interleaved or transposed matrix. Data type supported: S8 + * @param[in] input Input tensor. Data type supported: S8 * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 * @param[in] k Number of matrix A columns (or matrix B rows) * @param[in] is_reshaped True if the input tensor has been reshaped @@ -72,14 +71,12 @@ class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel public: /** Initialise the kernel's input and output. * - * @note The input matrix @p mtx_a_interleaved4x4 must be the output of @ref NEGEMMInterleave4x4Kernel. - * - * @param[in] mtx_a_interleaved4x4 Input tensor containing the interleaved Matrix A. Data type supported: U8 - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a_interleaved4x4. Data type supported: S32 - * @param[in] num_mtx_a_cols Number of matrix A columns - * @param[in] is_interleaved4x4 True if the input tensor is interleaved4x4 + * @param[in] mtx_a Input tensor. 
Data type supported: QASYMM8 + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] num_mtx_a_cols Number of matrix A columns + * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4 */ - void configure(const ITensor *mtx_a_interleaved4x4, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override; + void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -95,14 +92,12 @@ class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel public: /** Initialise the kernel's input and output. * - * @note The input matrix @p mtx_b_transposed1xW must be the output of @ref NEGEMMTranspose1xWKernel kernel. - * - * @param[in] mtx_b_transposed1xW Input tensor containing the transposed Matrix B. Data type supported: Data type supported: U8 - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b_transposed1xW. Data type supported: S32 - * @param[in] num_mtx_b_rows Number of matrix B rows - * @param[in] is_transposed1xW True if the input tensor is transposed 1xW + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8 + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] num_mtx_b_rows Number of matrix B rows + * @param[in] is_transposed1xW True if the input tensor is transposed 1xW */ - void configure(const ITensor *mtx_b_transposed1xW, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override; + void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 563ade288a..118603b20b 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -60,8 +60,8 @@ #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" #include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h" #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h deleted file mode 100644 index 59c919e161..0000000000 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
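For clarity, a scalar sketch of what the two reduction kernels documented above compute (illustration only; the function name and row-major buffer layout are assumptions):

    #include <cstdint>

    // Hypothetical scalar equivalent of NEGEMMLowpMatrixAReductionKernel:
    // one S32 sum per row of the QASYMM8 matrix A (M rows, num_mtx_a_cols columns, row-major).
    void matrix_a_reduction_ref(const uint8_t *A, int32_t *vector_sum_row, int M, int num_mtx_a_cols)
    {
        for(int i = 0; i < M; ++i)
        {
            int32_t sum = 0;
            for(int k = 0; k < num_mtx_a_cols; ++k)
            {
                sum += static_cast<int32_t>(A[i * num_mtx_a_cols + k]);
            }
            vector_sum_row[i] = sum;
        }
    }
    // NEGEMMLowpMatrixBReductionKernel is the analogous column-wise sum over matrix B.
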
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWP_H__ -#define __ARM_COMPUTE_NEGEMMLOWP_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/Tensor.h" - -#include - -namespace arm_compute -{ -class ITensor; - -/** Basic function to execute GEMMLowp on NEON. This function calls the following NEON kernels/function: - * - * -# @ref NEGEMMLowpMatrixAReductionKernel - * -# @ref NEGEMMLowpMatrixBReductionKernel - * -# @ref NEGEMMLowpMatrixMultiplyCore - * -# @ref NEGEMMLowpFinalizeKernel - * -*/ -class NEGEMMLowp : public IFunction -{ -public: - /** Constructor */ - NEGEMMLowp(std::shared_ptr memory_manager = nullptr); - /** Initialise the kernel's inputs, output - * - * @note GEMM_LOWP: low precision GEMM kernel - * This kernel performs the following computations: - * - * -# Convert a values from int8 to int32 and add a_offset to each of them. - * -# Convert b values from int8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to int8. - * - * @param[in] a First input tensor (Matrix A). Data type supported: S8. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[out] output Output tensor. Data type supported: same as @p a. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] c_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Value to be multiplied to each element of the output matrix - * @param[in] shift Number of bits to shift right the result. 
- */ - void configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t output_mult_int, int32_t shift); - - // Inherited methods overridden: - void run() override; - -private: - MemoryGroup _memory_group; - NEGEMMLowpMatrixMultiplyCore _mm_func; - NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; - NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; - NEGEMMLowpFinalizeKernel _finalize_kernel; - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _mm_output; - int32_t _a_offset; - int32_t _b_offset; -}; -} -#endif /*__ARM_COMPUTE_NEGEMMLOWP_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index c81a432295..0c441df4b9 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -25,6 +25,8 @@ #define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ #include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -41,11 +43,13 @@ class ITensor; * -# @ref NEGEMMInterleave4x4Kernel * -# @ref NEGEMMTranspose1xWKernel * -# @ref NEGEMMLowpMatrixMultiplyKernel + * -# @ref NEGEMMLowpOffsetContributionKernel * * otherwise if the DOT product instruction is available: * * -# @ref NEGEMMInterleaveBlockedKernel * -# @ref NEGEMMLowpAArch64V8P4Kernel + * -# @ref NEGEMMLowpOffsetContributionKernel * */ class NEGEMMLowpMatrixMultiplyCore : public IFunction @@ -58,11 +62,11 @@ public: * @note GEMM_LOWP: low precision GEMM kernel * This kernel performs the following computations: * - * -# Convert a values from uint8 to int32 - * -# Convert b values from uint8 to int32 - * -# Compute the int32 matrix product of the resulting a * b. + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. * - * @param[in] a First input tensor (Matrix A). Data type supported: U8. + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a * @param[out] output Output tensor. 
Data type supported: Data type supported: S32 */ @@ -72,13 +76,20 @@ public: void run() override; private: - MemoryGroup _memory_group; - std::unique_ptr _mm_kernel; - std::unique_ptr _mtx_a_reshape_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _workspace; + MemoryGroup _memory_group; + std::unique_ptr _mm_kernel; + std::unique_ptr _mtx_a_reshape_kernel; + std::unique_ptr _mtx_b_reshape_kernel; + NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + Tensor _vector_sum_col; + Tensor _vector_sum_row; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _workspace; + int32_t _a_offset; + int32_t _b_offset; }; } #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h new file mode 100644 index 0000000000..8557ef42e1 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ +#define __ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +/** This file contains all available output stages for GEMMLowp on NEON. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final ASYMM8 value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8Scale on NEON. + * + * NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift + * The final result is: + * + * ((input[i][k] + result_offset) * result_mult_int + rounding) >> result_shift + * + * where rounding = (result_shift < 1) ? 
0 : (1 << (result_shift - 1)) + * + * This function calls the following NEON kernels: + * + * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * +*/ +class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + */ + void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift); +}; +} +#endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */ \ No newline at end of file diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index 72a0ede29a..696364373d 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -260,7 +260,7 @@ v17.03.1 First Major public release of the sources - @ref arm_compute::NELogits1DMaxKernel, @ref arm_compute::NELogits1DShiftExpSumKernel, @ref arm_compute::NELogits1DNormKernel / @ref arm_compute::NESoftmaxLayer - @ref arm_compute::NEIm2ColKernel, @ref arm_compute::NECol2ImKernel, arm_compute::NEConvolutionLayerWeightsReshapeKernel / @ref arm_compute::NEConvolutionLayer - @ref arm_compute::NEGEMMMatrixAccumulateBiasesKernel / @ref arm_compute::NEFullyConnectedLayer - - @ref arm_compute::NEGEMMLowpMatrixMultiplyKernel / @ref arm_compute::NEGEMMLowp + - @ref arm_compute::NEGEMMLowpMatrixMultiplyKernel / arm_compute::NEGEMMLowp v17.03 Sources preview - New OpenCL kernels / functions: diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index ae5d456141..a29b661a00 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -132,7 +132,8 @@ NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel() void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); diff --git a/src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp deleted file mode 100644 index 255e486365..0000000000 --- a/src/core/NEON/kernels/NEGEMMLowpFinalizeKernel.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include -#include -#include - -using namespace arm_compute; - -namespace arm_compute -{ -class Coordinates; -} // namespace arm_compute - -template -void NEGEMMLowpFinalizeKernel::finalize(const Window &window) -{ - const int32x4_t c_offset_s32 = vdupq_n_s32(_c_offset); - const int32x4_t shift_s32 = vdupq_n_s32(-_shift); - - Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimZ); - - if(add_a_offset && add_b_offset) // true, true - { - // Set window for vector_sum_col - Window win_vector_sum_col(collapsed_window); - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(!_slide_vector_sum_col) - { - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - } - - // Set window for vector_sum_row - Window win_vector_sum_row(collapsed_window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); - Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); - Iterator mm_result(_mm_result, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); - - // Compute the leftover term due to b_offset. 
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr()) + id.y()); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = - { - { - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset) - } - }; - - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32)); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32)); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32)); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32)); - - // Add c_offset - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], c_offset_s32); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], c_offset_s32); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], c_offset_s32); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], c_offset_s32); - - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _c_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _c_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _c_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _c_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], shift_s32); - - // Convert S32 to U16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])), - } - }; - - // Convert S16 to S8 - const int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - vst1q_s8(reinterpret_cast(out.ptr()), out_s8); - }, - vector_sum_col, vector_sum_row, mm_result, out); - } - else if(!add_a_offset && add_b_offset) // false, true - { - // Set window for vector_sum_row - Window win_vector_sum_row(collapsed_window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); - Iterator mm_result(_mm_result, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - // Compute the leftover term due to b_offset. 
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr()) + id.y()); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); - - // Add b_offset_term_s32 and c_offset_term_s32 - int32x4_t offset_term_s32 = vaddq_s32(b_offset_term_s32, c_offset_s32); - - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _c_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _c_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _c_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _c_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], shift_s32); - - // Convert S32 to U16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])), - } - }; - - // Convert S16 to S8 - const int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - vst1q_s8(reinterpret_cast(out.ptr()), out_s8); - }, - vector_sum_row, mm_result, out); - } - else if(add_a_offset && !add_b_offset) // true, false - { - // Set window for vector_sum_col - Window win_vector_sum_col(collapsed_window); - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(!_slide_vector_sum_col) - { - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - } - - Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); - Iterator mm_result(_mm_result, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - // Compute the leftover term due to a_offset. 
- int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = - { - { - vaddq_s32(c_offset_s32, a_offset_term_s32.val[0]), - vaddq_s32(c_offset_s32, a_offset_term_s32.val[1]), - vaddq_s32(c_offset_s32, a_offset_term_s32.val[2]), - vaddq_s32(c_offset_s32, a_offset_term_s32.val[3]) - } - }; - - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _c_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _c_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _c_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _c_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], shift_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 - const int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - vst1q_s8(reinterpret_cast(out.ptr()), out_s8); - }, - vector_sum_col, mm_result, out); - } - else // false, false - { - Iterator mm_result(_mm_result, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], c_offset_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], c_offset_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], c_offset_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], c_offset_s32); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _c_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _c_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _c_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _c_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], 
shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], shift_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert U16 to S8 - const int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - vst1q_s8(reinterpret_cast(out.ptr()), out_s8); - }, - mm_result, out); - } -} - -NEGEMMLowpFinalizeKernel::NEGEMMLowpFinalizeKernel() - : _func(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _c_offset(0), _k_offset(0), _c_mult_int(0), _shift(0), - _slide_vector_sum_col(true) -{ -} - -void NEGEMMLowpFinalizeKernel::configure(const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *mm_result, ITensor *output, int32_t num_mtx_a_cols, int32_t a_offset, - int32_t b_offset, - int32_t c_offset, int32_t c_mult_int, int32_t shift) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); - - TensorShape mm_result_shape = mm_result->info()->tensor_shape(); - TensorShape output_shape = output->info()->tensor_shape(); - - mm_result_shape.collapse(2); - output_shape.collapse(2); - - ARM_COMPUTE_ERROR_ON_MSG(mm_result_shape[2] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); - - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); - vector_sum_col_shape.collapse(1); - - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col_shape[1] != 1; - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); - - TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); - vector_sum_row_shape.collapse(1); - - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); - vector_sum_col_shape.collapse(1); - - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 - && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _output = output; - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * num_mtx_a_cols; - _c_offset = c_offset; - _c_mult_int = c_mult_int; - _shift = shift; - - constexpr unsigned 
int num_elems_processed_per_iteration = 16; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration); - - // Accordingly with a_offset and b_offset, we can have 4 cases: - // a_offset != 0 && b_offset != 0 - // a_offset = 0 && b_offset != 0 - // a_offset != 0 && b_offset = 0 - // a_offset = 0 && b_offset = 0 - if(a_offset != 0 && b_offset != 0) - { - // Set the function to use - _func = &NEGEMMLowpFinalizeKernel::finalize; - - AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); - AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - vector_sum_col_access, - vector_sum_row_access, - mm_result_access, - output_result_access); - } - else if(a_offset == 0 && b_offset != 0) - { - // Set the function to use - _func = &NEGEMMLowpFinalizeKernel::finalize; - - AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); - - update_window_and_padding(win, - vector_sum_row_access, - mm_result_access, - output_result_access); - } - else if(a_offset != 0 && b_offset == 0) - { - // Set the function to use - _func = &NEGEMMLowpFinalizeKernel::finalize; - - AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - vector_sum_col_access, - mm_result_access, - output_result_access); - } - else - { - // Set the function to use - _func = &NEGEMMLowpFinalizeKernel::finalize; - - update_window_and_padding(win, - mm_result_access, - output_result_access); - } - - output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - - INEKernel::configure(win); -} - -void NEGEMMLowpFinalizeKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - (this->*_func)(window); -} diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index 4b9c9f3e64..1352f34e3c 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -52,7 +52,7 @@ NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); @@ -127,115 +127,115 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo // All the values needed for computing a single 4x4 block will be read from consecutive memory positions execute_window_loop(window, [&](const Coordinates & id) { - auto *mtx_a0 = reinterpret_cast(ina.ptr()); - auto *mtx_b0 = reinterpret_cast(inb.ptr()); + const uint8_t *mtx_a0 = ina.ptr(); + const uint8_t *mtx_b0 = inb.ptr(); // Note: 
Since the input are all positives, we can use uint32_t // Accumulators for the block 0 - int32x4x4_t c0 = + uint32x4x4_t c0 = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; // Accumulators for the block 1 - int32x4x4_t c1 = + uint32x4x4_t c1 = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; // Accumulators for the block 2 - int32x4x4_t c2 = + uint32x4x4_t c2 = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; // Accumulators for the block 3 - int32x4x4_t c3 = + uint32x4x4_t c3 = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) { - const int8x8_t a00_s8 = vld1_s8(mtx_a0); - const int8x16_t b00_s8 = vld1q_s8(mtx_b0); + const uint8x8_t a00_u8 = vld1_u8(mtx_a0); + const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - // Convert b00_s8 to int16_t - const int16x4x4_t b00_s16 = + // Convert b00_s8 to uint16_t + const uint16x4x4_t b00_u16 = { { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) } }; // 4x4 block 0 - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); // 4x4 block 1 - c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); - c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); - c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); - c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); + c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); + c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); + c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); + c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); // 4x4 block 2 - c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); - c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); - c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); - c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); + c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); + c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); + c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); + c2.val[3] = vmlal_lane_u16(c2.val[3], 
b00_u16.val[3], a00_u16, 2); // 4x4 block 3 - c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); - c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); - c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); - c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); + c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); + c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); + c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); + c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); } auto mtx_out = reinterpret_cast(out.ptr()); - vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); - vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); - vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); - vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); - vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); - vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); - vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); - vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); - vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); - vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); - vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); - vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); - vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); - vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); - vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); - vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); + vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); + vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); + vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); + vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); + vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); + vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); + vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); + vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); + vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); + vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); + vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); + vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); }, ina, inb, out); } diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp new file mode 100644 index 0000000000..bd550db54c --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include
+#include
+#include
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
+{
+}
+
+void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0));
+
+        TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
+        vector_sum_col_shape.collapse(1);
+
+        // Check whether vector_sum_col has to be slid or not
+        // Don't slide vector_sum_col along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape has more than 1
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        _slide_vector_sum_col = vector_sum_col_shape[1] != 1;
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1));
+
+        TensorShape output_shape = mm_result->info()->tensor_shape();
+        TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape();
+        vector_sum_row_shape.collapse(1);
+        output_shape.collapse(2);
+
+        ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches as the
output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + vector_sum_col_shape.collapse(1); + + ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 + && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + _a_offset = a_offset; + _b_offset = b_offset; + _k_offset = a_offset * b_offset * k; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration); + + // Accordingly with a_offset and b_offset, we can have 4 cases: + // a_offset != 0 && b_offset != 0 + // a_offset = 0 && b_offset != 0 + // a_offset != 0 && b_offset = 0 + // a_offset = 0 && b_offset = 0 + if(a_offset != 0 && b_offset != 0) + { + AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); + AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + vector_sum_col_access, + vector_sum_row_access, + mm_result_access); + } + else if(a_offset == 0 && b_offset != 0) + { + AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); + + update_window_and_padding(win, + vector_sum_row_access, + mm_result_access); + } + else if(a_offset != 0 && b_offset == 0) + { + AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + vector_sum_col_access, + mm_result_access); + } + else + { + update_window_and_padding(win, + mm_result_access); + } + + INEKernel::configure(win); +} + +void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimZ); + + if(_a_offset != 0 && _b_offset != 0) // true, true + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + if(!_slide_vector_sum_col) + { + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + } + + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); + Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); + Iterator mm_result(_mm_result, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Compute the leftover term due to a_offset. 
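+            // This term equals a_offset * sum_k B(k, j): vector_sum_col holds the per-column sums of B
+            // produced by NEGEMMLowpMatrixBReductionKernel, so the 16 values loaded below for the current
+            // block of output columns only need to be scaled by a_offset.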
+ int32x4x4_t a_offset_term_s32 = + { + { + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) + } + }; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); + + // Compute the leftover term due to b_offset. + int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr()) + id.y()); + b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = + { + { + vdupq_n_s32(_k_offset), + vdupq_n_s32(_k_offset), + vdupq_n_s32(_k_offset), + vdupq_n_s32(_k_offset) + } + }; + + offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32)); + offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32)); + offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32)); + offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32)); + + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + }, + vector_sum_col, vector_sum_row, mm_result); + } + else if((_a_offset == 0) && (_b_offset != 0)) // false, true + { + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); + Iterator mm_result(_mm_result, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Compute the leftover term due to b_offset. 
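+            // This term equals b_offset * sum_k A(i, k): vector_sum_row holds the per-row sums of A
+            // produced by NEGEMMLowpMatrixAReductionKernel, so a single value is loaded for the current
+            // output row (id.y()), broadcast across the vector and scaled by b_offset.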
+ int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr()) + id.y()); + b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); + + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32); + + // Store the result with the offset contribution + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + }, + vector_sum_row, mm_result); + } + else if((_a_offset != 0) && (_b_offset == 0)) // true, false + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + if(!_slide_vector_sum_col) + { + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + } + + Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); + Iterator mm_result(_mm_result, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Compute the leftover term due to a_offset. + int32x4x4_t a_offset_term_s32 = + { + { + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), + vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) + } + }; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); + + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + }, + vector_sum_col, mm_result); + } + else // false, false + { + // No offset contribution from matrix A and matrix B + return; + } +} diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp new file mode 100644 index 0000000000..aa3c280788 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp @@ -0,0 
+1,141 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel() + : _input(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0) +{ +} + +void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + + _input = input; + _output = output; + _result_offset = result_offset; + _result_mult_int = result_mult_int; + _result_shift = result_shift; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + output_result_access); + + output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(-_result_shift); + const int32x4_t zero_s32 = vdupq_n_s32(0); + + Iterator in(_input, window); + Iterator out(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + int32x4x4_t in_s32 = + { + { + 
vld1q_s32(reinterpret_cast(in.ptr()) + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32); + + // Multiply by c_mult_int + in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _result_mult_int); + in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _result_mult_int); + in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _result_mult_int); + in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _result_mult_int); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to U8 + const uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); + + vst1q_u8(out.ptr(), out_u8); + }, + in, out); +} \ No newline at end of file diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index 9df13ce0e3..81d9b5bb81 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -49,12 +49,12 @@ INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel() { } -void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a_interleaved4x4, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) +void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a_interleaved4x4, 1, DataType::S8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - _input = mtx_a_interleaved4x4; + _input = mtx_a; _output = vector_sum_row; _k = num_mtx_a_cols; _is_reshaped = is_interleaved4x4; @@ -97,9 +97,9 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - int32x4_t sum_row = vdupq_n_s32(0); + uint32x4_t sum_row = vdupq_n_u32(0); - auto matrix_a = reinterpret_cast(in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -109,43 +109,43 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, 
const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const int8x16_t a0_s8 = vld1q_s8(matrix_a + i * 4); + const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4); // Convert U8 to U16 - int16x4x4_t a0_s16 = + uint16x4x4_t a0_u16 = { { - vget_low_s16(vmovl_s8(vget_low_s8(a0_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(a0_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(a0_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(a0_s8))) + vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(a0_u8))) } }; // Accumulate to U16 - a0_s16.val[0] = vadd_s16(a0_s16.val[0], a0_s16.val[1]); - a0_s16.val[0] = vadd_s16(a0_s16.val[0], a0_s16.val[2]); - a0_s16.val[0] = vadd_s16(a0_s16.val[0], a0_s16.val[3]); + a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]); + a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]); + a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]); // Accumulate to U32 - sum_row = vaddw_s16(sum_row, a0_s16.val[0]); + sum_row = vaddw_u16(sum_row, a0_u16.val[0]); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - const int8x8_t a0_s8 = vld1_s8(matrix_a + i * 4); + const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4); // Convert U8 to U16 - const int16x4_t a0_s16 = vget_low_s16(vmovl_s8(a0_s8)); + const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8)); // Accumulate to U32 - sum_row = vaddw_s16(sum_row, a0_s16); + sum_row = vaddw_u16(sum_row, a0_u16); } auto vector_sum_row = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_row, sum_row); + vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row)); }, in, out); } @@ -154,10 +154,10 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - int32x4_t sum_row_s32 = vdupq_n_s32(0); - int32_t sum_row = 0; + uint32x4_t sum_row_u32 = vdupq_n_u32(0); + uint32_t sum_row = 0; - auto matrix_a = reinterpret_cast(in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -167,29 +167,29 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 16 accumulations for(; i <= (_k - 16); i += 16) { - const int8x16_t a0_s8 = vld1q_s8(matrix_a + i); + const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i); // Partial accumulations in U16 - const int16x8_t tmp_sum0 = vaddl_s8(vget_low_s8(a0_s8), vget_high_s8(a0_s8)); + const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8)); // Accumulate to U32 - sum_row_s32 = vaddq_s32(sum_row_s32, vpaddlq_s16(tmp_sum0)); + sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0)); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - sum_row += static_cast(matrix_a[i]); + sum_row += static_cast(matrix_a[i]); } #if defined(__aarch64__) // Reduction operation available on 64 bit architectures only - sum_row += vaddvq_s32(sum_row_s32); + sum_row += vaddvq_u32(sum_row_u32); #else // __aarch64__ - int32x2_t tmp = vpadd_s32(vget_high_s32(sum_row_s32), vget_low_s32(sum_row_s32)); - tmp = vpadd_s32(tmp, tmp); + uint32x2_t tmp 
= vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32)); + tmp = vpadd_u32(tmp, tmp); - sum_row += vget_lane_s32(tmp, 0); + sum_row += vget_lane_u32(tmp, 0); #endif // __aarch64__ *(reinterpret_cast(out.ptr())) = static_cast(sum_row); @@ -198,12 +198,12 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf } } -void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b_transposed1xW, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) +void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b_transposed1xW, 1, DataType::S8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - _input = mtx_b_transposed1xW; + _input = mtx_b; _output = vector_sum_col; _k = num_mtx_b_rows; _is_reshaped = is_transposed1xW; @@ -246,17 +246,17 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - int32x4x4_t sum_col = + uint32x4x4_t sum_col = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; - auto matrix_b = reinterpret_cast(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]; #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -265,14 +265,14 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf int i = 0; for(; i < _k; ++i) { - const int8x16_t b0_s8 = vld1q_s8(matrix_b + i * 16); + const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16); // Convert S8 to U16 - const int16x8x2_t b0_s16 = + const uint16x8x2_t b0_u16 = { { - vmovl_s8(vget_low_s8(b0_s8)), - vmovl_s8(vget_high_s8(b0_s8)) + vmovl_u8(vget_low_u8(b0_u8)), + vmovl_u8(vget_high_u8(b0_u8)) } }; @@ -280,20 +280,20 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf sum_col = { { - vaddw_s16(sum_col.val[0], vget_low_s16(b0_s16.val[0])), - vaddw_s16(sum_col.val[1], vget_high_s16(b0_s16.val[0])), - vaddw_s16(sum_col.val[2], vget_low_s16(b0_s16.val[1])), - vaddw_s16(sum_col.val[3], vget_high_s16(b0_s16.val[1])) + vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), + vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), + vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), + vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) } }; } auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, sum_col.val[0]); - vst1q_s32(vector_sum_col + 4, sum_col.val[1]); - vst1q_s32(vector_sum_col + 8, sum_col.val[2]); - vst1q_s32(vector_sum_col + 12, sum_col.val[3]); + vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); + vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); + vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); + vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); }, in, out); } @@ -326,17 +326,17 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const 
ThreadInf } // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - int32x4x4_t sum_col = + uint32x4x4_t sum_col = { { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) } }; - auto matrix_b = reinterpret_cast(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]; #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -347,10 +347,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const int8x16_t b0_s8 = vld1q_s8(matrix_b + 0 * in_b_stride); - const int8x16_t b1_s8 = vld1q_s8(matrix_b + 1 * in_b_stride); - const int8x16_t b2_s8 = vld1q_s8(matrix_b + 2 * in_b_stride); - const int8x16_t b3_s8 = vld1q_s8(matrix_b + 3 * in_b_stride); + const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); + const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride); + const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride); + const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride); #if __arm__ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); @@ -360,31 +360,31 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf #endif /* __arm__ */ // Partial accumulation in u16 - int16x8x2_t tmp_sum = + uint16x8x2_t tmp_sum = { { - vdupq_n_s16(0), - vdupq_n_s16(0) + vdupq_n_u16(0), + vdupq_n_u16(0) } }; - tmp_sum.val[0] = vaddw_s8(tmp_sum.val[0], vget_low_s8(b0_s8)); - tmp_sum.val[0] = vaddw_s8(tmp_sum.val[0], vget_low_s8(b1_s8)); - tmp_sum.val[0] = vaddw_s8(tmp_sum.val[0], vget_low_s8(b2_s8)); - tmp_sum.val[0] = vaddw_s8(tmp_sum.val[0], vget_low_s8(b3_s8)); - tmp_sum.val[1] = vaddw_s8(tmp_sum.val[1], vget_high_s8(b0_s8)); - tmp_sum.val[1] = vaddw_s8(tmp_sum.val[1], vget_high_s8(b1_s8)); - tmp_sum.val[1] = vaddw_s8(tmp_sum.val[1], vget_high_s8(b2_s8)); - tmp_sum.val[1] = vaddw_s8(tmp_sum.val[1], vget_high_s8(b3_s8)); + tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8)); + tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8)); + tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8)); + tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8)); + tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8)); + tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8)); + tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8)); + tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8)); // Accumulate to U32 sum_col = { { - vaddw_s16(sum_col.val[0], vget_low_s16(tmp_sum.val[0])), - vaddw_s16(sum_col.val[1], vget_high_s16(tmp_sum.val[0])), - vaddw_s16(sum_col.val[2], vget_low_s16(tmp_sum.val[1])), - vaddw_s16(sum_col.val[3], vget_high_s16(tmp_sum.val[1])) + vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])), + vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])), + vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])), + vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1])) } }; @@ -394,14 +394,14 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop perfoms the leftover accumulations for(; i < _k; ++i) { - const int8x16_t b0_s8 = vld1q_s8(matrix_b + 0 * in_b_stride); + const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); // Convert S8 to S16 - const int16x8x2_t b0_s16 = + const uint16x8x2_t b0_u16 = 
{ { - vmovl_s8(vget_low_s8(b0_s8)), - vmovl_s8(vget_high_s8(b0_s8)) + vmovl_u8(vget_low_u8(b0_u8)), + vmovl_u8(vget_high_u8(b0_u8)) } }; @@ -409,10 +409,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf sum_col = { { - vaddw_s16(sum_col.val[0], vget_low_s16(b0_s16.val[0])), - vaddw_s16(sum_col.val[1], vget_high_s16(b0_s16.val[0])), - vaddw_s16(sum_col.val[2], vget_low_s16(b0_s16.val[1])), - vaddw_s16(sum_col.val[3], vget_high_s16(b0_s16.val[1])) + vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), + vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), + vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), + vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) } }; @@ -421,10 +421,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, sum_col.val[0]); - vst1q_s32(vector_sum_col + 4, sum_col.val[1]); - vst1q_s32(vector_sum_col + 8, sum_col.val[2]); - vst1q_s32(vector_sum_col + 12, sum_col.val[3]); + vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); + vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); + vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); + vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); }, inb, out); } diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 7f4ee1ec49..7f83144e12 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -43,7 +43,8 @@ using namespace arm_compute; void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp deleted file mode 100644 index 90bc6a205b..0000000000 --- a/src/runtime/NEON/functions/NEGEMMLowp.cpp +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -NEGEMMLowp::NEGEMMLowp(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _mm_func(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _finalize_kernel(), _vector_sum_col(), _vector_sum_row(), _mm_output(), _a_offset(0), - _b_offset(0) -{ -} - -void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t output_mult_int, int32_t shift) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN((a), 1, DataType::S8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - - _a_offset = a_offset; - _b_offset = b_offset; - - // Initialize matrix multiply output tensor - const TensorShape &shape_mm_output = output->info()->tensor_shape(); - TensorInfo info_mm_output(shape_mm_output, 1, DataType::S32); - _mm_output.allocator()->init(info_mm_output); - _memory_group.manage(&_mm_output); - - // Initialize Matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - TensorShape shape_vector_sum_col = b->info()->tensor_shape(); - shape_vector_sum_col.remove_dimension(1); - TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32); - _vector_sum_col.allocator()->init(info_vector_sum_col); - _memory_group.manage(&_vector_sum_col); - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - TensorShape shape_vector_sum_row = a->info()->tensor_shape(); - shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1)); - shape_vector_sum_row.remove_dimension(1); - TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32); - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure Matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false); - } - - // Configure matrix multiply function - _mm_func.configure(a, b, &_mm_output); - - // Configure finalize kernel - 
_finalize_kernel.configure(_a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, &_mm_output, output, a->info()->dimension(0), a_offset, b_offset, c_offset,
-                           output_mult_int, shift);
-
-    // Allocate tensors
-    _mm_output.allocator()->allocate();
-
-    if(_a_offset != 0)
-    {
-        _vector_sum_col.allocator()->allocate();
-    }
-
-    if(_b_offset != 0)
-    {
-        _vector_sum_row.allocator()->allocate();
-    }
-}
-
-void NEGEMMLowp::run()
-{
-    _memory_group.acquire();
-
-    // Run matrix A reduction kernel only if _b_offset is not equal to 0
-    if(_b_offset != 0)
-    {
-        NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
-    }
-
-    // Run matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0)
-    {
-        NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
-    }
-
-    // Run matrix multiply core function
-    _mm_func.run();
-
-    // Run finalise kernel
-    NEScheduler::get().schedule(&_finalize_kernel, Window::DimY);
-
-    _memory_group.release();
-}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 29104cc378..929ee41220 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -47,19 +47,25 @@ namespace arm_compute
 using namespace arm_compute;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
+      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0)
 {
 }
 
 void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
     ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
 
+    bool dot_product_path = false;
+
+    _a_offset = a->info()->quantization_info().offset;
+    _b_offset = b->info()->quantization_info().offset;
+
 #ifdef ARM_COMPUTE_AARCH64_V8_2
     // Check for DOT product instruction
     const struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -67,6 +73,13 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
     if(cpu_has_dotprod != 0)
     {
+        dot_product_path = true;
+
+        // If the DOT product instruction is available, the computation will be performed in int8_t
+        // To take this into account, we subtract 128 from both a_offset and b_offset
+        _a_offset -= 128;
+        _b_offset -= 128;
+
         // Configure matrix multiply kernel
         struct CPUInfo ci =
NEScheduler::get().cpu_info(); const int M = output->info()->tensor_shape().y(); @@ -77,12 +90,11 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, constexpr size_t alignment = 4096; _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8)); _memory_group.manage(&_workspace); + // Configure matrix multiplication kernel auto k = arm_compute::support::cpp14::make_unique(); k->configure(a, b, output, &_workspace, 1.f, 1.f); _mm_kernel = std::move(k); - - _workspace.allocator()->allocate(); } else #endif /* ARM_COMPUTE_AARCH64_V8_2 */ @@ -124,11 +136,58 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, k->configure(&_tmp_a, &_tmp_b, output); _mm_kernel = std::move(k); } + } - // Allocate tensors + // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + TensorShape shape_vector_sum_col = b->info()->tensor_shape(); + shape_vector_sum_col.remove_dimension(1); + TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32); + _vector_sum_col.allocator()->init(info_vector_sum_col); + _memory_group.manage(&_vector_sum_col); + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false); + } + + // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + TensorShape shape_vector_sum_row = a->info()->tensor_shape(); + shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1)); + shape_vector_sum_row.remove_dimension(1); + TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32); + _vector_sum_row.allocator()->init(info_vector_sum_row); + _memory_group.manage(&_vector_sum_row); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false); + } + + // Configure offset contribution kernel + _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset); + + // Allocate tensors + if(!dot_product_path) + { _tmp_a.allocator()->allocate(); _tmp_b.allocator()->allocate(); } + else + { + _workspace.allocator()->allocate(); + } + + if(_a_offset != 0) + { + _vector_sum_col.allocator()->allocate(); + } + + if(_b_offset != 0) + { + _vector_sum_row.allocator()->allocate(); + } } void NEGEMMLowpMatrixMultiplyCore::run() @@ -147,5 +206,20 @@ void NEGEMMLowpMatrixMultiplyCore::run() NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + } + + // Run offset contribution kernel + NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp new file mode 100644 index 0000000000..d09827f908 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output, result_offset, result_mult_int, result_shift); + _kernel = std::move(k); +} \ No newline at end of file diff --git a/tests/benchmark/NEON/GEMMLowp.cpp b/tests/benchmark/NEON/GEMMLowp.cpp index 8cf143393d..a0e5e694bd 100644 --- a/tests/benchmark/NEON/GEMMLowp.cpp +++ b/tests/benchmark/NEON/GEMMLowp.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" diff --git a/tests/datasets/GEMMLowpDataset.h b/tests/datasets/GEMMLowpDataset.h index 4bf2a98d61..062c05b1d9 100644 --- a/tests/datasets/GEMMLowpDataset.h +++ b/tests/datasets/GEMMLowpDataset.h @@ -37,7 +37,7 @@ namespace datasets class GEMMLowpDataset { public: - using type = std::tuple; + using type = std::tuple; struct iterator { @@ -45,18 +45,12 @@ public: std::vector::const_iterator b_it, std::vector::const_iterator c_it, std::vector::const_iterator a_offset_it, - std::vector::const_iterator b_offset_it, - std::vector::const_iterator c_offset_it, - std::vector::const_iterator c_mult_int_it, - std::vector::const_iterator out_shift_it) + std::vector::const_iterator b_offset_it) : _a_it{ std::move(a_it) }, _b_it{ std::move(b_it) }, _c_it{ std::move(c_it) }, _a_offset_it{ std::move(a_offset_it) }, - _b_offset_it{ std::move(b_offset_it) }, - _c_offset_it{ std::move(c_offset_it) }, - _c_mult_int_it{ std::move(c_mult_int_it) }, - _out_shift_it{ std::move(out_shift_it) } + _b_offset_it{ std::move(b_offset_it) } { } @@ -68,15 +62,12 @@ public: description << "C=" << *_c_it << ":"; description << "a_offset=" << *_a_offset_it << ":"; description << "b_offset=" << *_b_offset_it << ":"; - description << "c_offset=" << *_c_offset_it << ":"; - description << "c_mult_int=" << *_c_mult_int_it << ":"; - 
description << "out_shift=" << *_out_shift_it << ":"; return description.str(); } GEMMLowpDataset::type operator*() const { - return std::make_tuple(*_a_it, *_b_it, *_c_it, *_a_offset_it, *_b_offset_it, *_c_offset_it, *_c_mult_int_it, *_out_shift_it); + return std::make_tuple(*_a_it, *_b_it, *_c_it, *_a_offset_it, *_b_offset_it); } iterator &operator++() @@ -86,9 +77,6 @@ public: ++_c_it; ++_a_offset_it; ++_b_offset_it; - ++_c_offset_it; - ++_c_mult_int_it; - ++_out_shift_it; return *this; } @@ -99,32 +87,25 @@ public: std::vector::const_iterator _c_it; std::vector::const_iterator _a_offset_it; std::vector::const_iterator _b_offset_it; - std::vector::const_iterator _c_offset_it; - std::vector::const_iterator _c_mult_int_it; - std::vector::const_iterator _out_shift_it; }; iterator begin() const { - return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _a_offset.begin(), _b_offset.begin(), _c_offset.begin(), _c_mult_int.begin(), _out_shift.begin()); + return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _a_offset.begin(), _b_offset.begin()); } int size() const { - return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), std::min(_a_offset.size(), std::min(_b_offset.size(), std::min(_c_offset.size(), std::min(_c_mult_int.size(), - _out_shift.size()))))))); + return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), std::min(_a_offset.size(), _b_offset.size())))); } - void add_config(TensorShape a, TensorShape b, TensorShape c, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift) + void add_config(TensorShape a, TensorShape b, TensorShape c, int32_t a_offset, int32_t b_offset) { _a_shapes.emplace_back(std::move(a)); _b_shapes.emplace_back(std::move(b)); _c_shapes.emplace_back(std::move(c)); _a_offset.emplace_back(std::move(a_offset)); _b_offset.emplace_back(std::move(b_offset)); - _c_offset.emplace_back(std::move(c_offset)); - _c_mult_int.emplace_back(std::move(c_mult_int)); - _out_shift.emplace_back(std::move(out_shift)); } protected: @@ -137,9 +118,6 @@ private: std::vector _c_shapes{}; std::vector _a_offset{}; std::vector _b_offset{}; - std::vector _c_offset{}; - std::vector _c_mult_int{}; - std::vector _out_shift{}; }; } // namespace datasets } // namespace test diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h index 10f79e423d..cc1feb49a2 100644 --- a/tests/datasets/LargeGEMMLowpDataset.h +++ b/tests/datasets/LargeGEMMLowpDataset.h @@ -42,12 +42,12 @@ class LargeGEMMLowpDataset final : public GEMMLowpDataset public: LargeGEMMLowpDataset() { - add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0, 0, 1, 0); - add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4, 3, 2, 0); - add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0, 1, 1, 0); - add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, -6, 2, 2); - add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2, 8, 4, 3); - add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), -9, 1, -3, 3, 1); + add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0); + add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4); + add_config(TensorShape(697U, 872U), TensorShape(563U, 
697U), TensorShape(563U, 872U), -2, 0); + add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13); + add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2); + add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), -9, 1); } }; } // namespace datasets diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h index b7fe3907ad..881546e70f 100644 --- a/tests/datasets/SmallGEMMLowpDataset.h +++ b/tests/datasets/SmallGEMMLowpDataset.h @@ -42,12 +42,12 @@ class SmallGEMMLowpDataset final : public GEMMLowpDataset public: SmallGEMMLowpDataset() { - add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0, 0, 1, 0); - add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4, 3, 2, 0); - add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0, 1, 1, 0); - add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13, -6, 2, 2); - add_config(TensorShape(38U, 12U), TensorShape(21U, 38U), TensorShape(21U, 12U), -3, -2, 8, 4, 3); - add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1, -3, 3, 1); + add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0); + add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4); + add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0); + add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13); + add_config(TensorShape(38U, 12U), TensorShape(21U, 38U), TensorShape(21U, 12U), -3, -2); + add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1); } }; } // namespace datasets diff --git a/tests/validation/CPP/GEMMLowp.cpp b/tests/validation/CPP/GEMMLowp.cpp index e1d76503cd..bac3a20c8e 100644 --- a/tests/validation/CPP/GEMMLowp.cpp +++ b/tests/validation/CPP/GEMMLowp.cpp @@ -21,10 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "GEMM.h" +#include "GEMMLowp.h" #include "arm_compute/core/Types.h" -#include "tests/validation/FixedPoint.h" namespace arm_compute { @@ -34,17 +33,21 @@ namespace validation { namespace reference { -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c) +template +SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_UNUSED(a); - ARM_COMPUTE_UNUSED(b); - ARM_COMPUTE_UNUSED(c); - const int K = a.shape().x(); - const int b_width = b.shape().x(); - const int rows = c.shape().y(); //M - const int cols = c.shape().x(); //N + TensorShape shape(b.shape()[0], a.shape()[1]); + + SimpleTensor c(shape, DataType::S32); + + const int K = a.shape().x(); + const int b_width = b.shape().x(); + const int rows = c.shape().y(); //M + const int cols = c.shape().x(); //N + std::vector acc; acc.resize(cols); + for(int i = 0; i < rows; ++i) { for(int j = 0; j < cols; ++j) @@ -53,10 +56,10 @@ SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor } for(int k = 0; k < K; ++k) { - auto tmp_a = static_cast(a[k + i * K]); + const int32_t tmp_a = a_offset + static_cast(a[k + i * K]); for(int j = 0; j < b_width; ++j) { - auto tmp_b = static_cast(b[j + k * b_width]); + const int32_t tmp_b = b_offset + static_cast(b[j + k * b_width]); const int32_t mult_as_int = tmp_a * tmp_b; acc[j] += mult_as_int; } @@ -71,43 +74,21 @@ SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor } template -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift) +SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) { - const int K = a.shape().x(); - const int b_width = b.shape().x(); - const int rows = c.shape().y(); //M - const int cols = c.shape().x(); //N - std::vector acc; - acc.resize(cols); - for(int i = 0; i < rows; ++i) + SimpleTensor dst(in.shape(), DataType::QASYMM8); + + for(int i = 0; i < in.num_elements(); ++i) { - for(int j = 0; j < cols; ++j) - { - acc[j] = 0; - } - for(int k = 0; k < K; ++k) - { - const int32_t tmp_a = a_offset + static_cast(a[k + i * K]); - for(int j = 0; j < b_width; ++j) - { - const int32_t tmp_b = b_offset + static_cast(b[j + k * b_width]); - const int32_t mult_as_int = tmp_a * tmp_b; - acc[j] += mult_as_int; - } - } - for(int j = 0; j < cols; ++j) - { - const int32_t result = ((c_offset + acc[j]) * c_mult_int) >> out_shift; - c[j + i * cols] = static_cast(std::min(127, std::max(-128, result))); - } + const int32_t result = ((in[i] + result_offset) * result_mult_int) >> result_shift; + dst[i] = static_cast(std::max(0, std::min(255, result))); } - return c; + return dst; } -template SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift); +template SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset); +template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/CPP/GEMMLowp.h b/tests/validation/CPP/GEMMLowp.h index 2f903f2fe2..c09d8f6176 100644 --- 
a/tests/validation/CPP/GEMMLowp.h +++ b/tests/validation/CPP/GEMMLowp.h @@ -35,11 +35,11 @@ namespace validation { namespace reference { -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c); +template +SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset); template -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b, SimpleTensor &c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift); +SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 4924f98ea6..4407eff060 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -23,12 +23,15 @@ */ #include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" #include "tests/NEON/Helper.h" +#include "tests/PaddingCalculator.h" #include "tests/datasets/LargeGEMMLowpDataset.h" +#include "tests/datasets/ShapeDatasets.h" #include "tests/datasets/SmallGEMMLowpDataset.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" @@ -45,16 +48,13 @@ namespace validation { namespace { -const auto data_int_blk = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12) * framework::dataset::make("by", 8, 13) * framework::dataset::make("block", 4, 9); -const auto data_int_blk_tr = framework::dataset::make("M", 8, 17) * framework::dataset::make("N", 8, 14) * framework::dataset::make("by", 12) * framework::dataset::make("block", 4); -const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framework::dataset::make("N", 12, 20) * framework::dataset::make("K", 16); +const auto data_int_blk = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12) * framework::dataset::make("by", 8, 13) * framework::dataset::make("block", 4, 9); +const auto data_int_blk_tr = framework::dataset::make("M", 8, 17) * framework::dataset::make("N", 8, 14) * framework::dataset::make("by", 12) * framework::dataset::make("block", 4); } // namespace TEST_SUITE(NEON) TEST_SUITE(GEMMLowp) -TEST_SUITE(S8) - TEST_SUITE(INTERLEAVE_BLOCKED) using NEInterleaveBlocked = NESynthetizeFunction; @@ -77,50 +77,95 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleaveBlockedTransposedFixture, frame TEST_SUITE_END() -using NEGEMMLowpOffsetFixture = GEMMLowpOffsetValidationFixture; +TEST_SUITE(MatrixMultiplyCore) +using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), framework::dataset::make("DataType", - DataType::S8)), - shape_a, shape_b, shape_c, a_offset, b_offset, c_offset, c_mult_int, out_shift, data_type) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, 
framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), + shape_a, shape_b, shape_c, a_offset, b_offset) { // Create tensors - Tensor a = create_tensor(shape_a, data_type); - Tensor b = create_tensor(shape_b, data_type); - Tensor c = create_tensor(shape_c, data_type); + Tensor a = create_tensor(shape_a, DataType::QASYMM8); + Tensor b = create_tensor(shape_b, DataType::QASYMM8); + Tensor c = create_tensor(shape_c, DataType::S32); + + a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); + b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - NEGEMMLowp gemmlowp; - gemmlowp.configure(&a, &b, &c, a_offset, b_offset, c_offset, c_mult_int, out_shift); + NEGEMMLowpMatrixMultiplyCore gemmlowp_mm; + gemmlowp_mm.configure(&a, &b, &c); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset()) +{ + // Validate output + validate(Accessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpOffsetFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpDataset(), framework::dataset::make("DataType", DataType::S8))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset()) { // Validate output validate(Accessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpOffsetFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpDataset(), framework::dataset::make("DataType", DataType::S8))) +TEST_SUITE_END() // MatrixMultiplyCore + +TEST_SUITE(OutputStage) + +TEST_SUITE(QuantizeDownInt32ToUint8Scale) + +using NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture; + +const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -4, 4) * framework::dataset::make("result_mult_int", 1, 3) * framework::dataset::make("result_shift", 2, + 4); + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), quantize_down_int32_to_uint8_scale_cases), + shape, result_offset, result_mult_int, result_shift) +{ + // Create tensors + Tensor in = create_tensor(shape, DataType::S32); + Tensor out = create_tensor(shape, DataType::QASYMM8); + + ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage; + output_stage.configure(&in, &out, result_offset, result_mult_int, result_shift); + + // Validate valid region + const ValidRegion valid_region = shape_to_valid_region(shape); + validate(in.info()->valid_region(), valid_region); + validate(out.info()->valid_region(), valid_region); + + // Validate padding + const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); + validate(in.info()->padding(), padding); + validate(out.info()->padding(), padding); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), 
quantize_down_int32_to_uint8_scale_cases)) { // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() // U8 -TEST_SUITE(S32) -using NEGEMMLowpMatrixMultiplyFixture = GEMMLowpMatrixMultiplyValidationFixture; -FIXTURE_DATA_TEST_CASE(MatrixMultiply, NEGEMMLowpMatrixMultiplyFixture, framework::DatasetMode::PRECOMMIT, data_matrix_multiply) +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_cases)) { // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale + +TEST_SUITE_END() // OutputStage + +TEST_SUITE_END() // GEMMLowp +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index fba44008ba..f9b0dbd959 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -43,36 +43,39 @@ namespace test namespace validation { template -class GEMMLowpOffsetValidationFixture : public framework::Fixture +class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture { public: template - void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift, DataType data_type) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, int32_t a_offset, int32_t b_offset) { - _target = compute_target(shape_a, shape_b, shape_c, a_offset, b_offset, c_offset, c_mult_int, out_shift, data_type); - _reference = compute_reference(shape_a, shape_b, shape_c, a_offset, b_offset, c_offset, c_mult_int, out_shift, data_type); + _target = compute_target(shape_a, shape_b, shape_c, a_offset, b_offset); + _reference = compute_reference(shape_a, shape_b, shape_c, a_offset, b_offset); } protected: template void fill(U &&tensor, int i) { - ARM_COMPUTE_ERROR_ON(tensor.data_type() != DataType::S8); - std::uniform_int_distribution<> distribution(0, 3); + // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path + std::uniform_int_distribution<> distribution(1, 254); library->fill(tensor, distribution, i); } TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift, DataType data_type) + int32_t a_offset, int32_t b_offset) { // Create tensors - TensorType a = create_tensor(shape_a, data_type, 1); - TensorType b = create_tensor(shape_b, data_type, 1); - TensorType c = create_tensor(shape_c, data_type, 1); + TensorType a = create_tensor(shape_a, DataType::QASYMM8, 1); + TensorType b = create_tensor(shape_b, DataType::QASYMM8, 1); + TensorType c = create_tensor(shape_c, DataType::S32, 1); + + a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); + b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); // Create and configure function FunctionType gemmlowp; - gemmlowp.configure(&a, &b, &c, a_offset, b_offset, c_offset, c_mult_int, out_shift); + gemmlowp.configure(&a, &b, &c); ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); @@ -90,108 +93,91 @@ protected: 
// Fill tensors fill(AccessorType(a), 0); fill(AccessorType(b), 1); - fill(AccessorType(c), 2); // Compute GEMM function gemmlowp.run(); return c; } - SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, - int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift, DataType data_type) + SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, + int32_t a_offset, int32_t b_offset) { // Create reference - SimpleTensor a{ shape_a, data_type, 1 }; - SimpleTensor b{ shape_b, data_type, 1 }; - SimpleTensor c{ shape_c, data_type, 1 }; + SimpleTensor a{ shape_a, DataType::QASYMM8, 1 }; + SimpleTensor b{ shape_b, DataType::QASYMM8, 1 }; // Fill reference fill(a, 0); fill(b, 1); - fill(c, 2); - return reference::gemmlowp(a, b, c, a_offset, b_offset, c_offset, c_mult_int, out_shift); + return reference::gemmlowp_matrix_multiply_core(a, b, a_offset, b_offset); } - TensorType _target{}; - SimpleTensor _reference{}; + TensorType _target{}; + SimpleTensor _reference{}; }; template -class GEMMLowpMatrixMultiplyValidationFixture : public framework::Fixture +class GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture : public framework::Fixture { public: template - void setup(size_t m, size_t n, size_t k) + void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) { - const TensorShape shape_a(k, m); - const TensorShape shape_b(n, k); - const TensorShape shape_c(n, m); - _target = compute_target(shape_a, shape_b, shape_c); - _reference = compute_reference(shape_a, shape_b, shape_c); + _target = compute_target(shape, result_offset, result_mult_int, result_shift); + _reference = compute_reference(shape, result_offset, result_mult_int, result_shift); } protected: template - void fill(U &&tensor, int i, int lo, int hi) + void fill(U &&tensor, int i) { - std::uniform_int_distribution<> distribution(lo, hi); + std::uniform_int_distribution<> distribution(-6000, 6000); library->fill(tensor, distribution, i); } - TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c) + TensorType compute_target(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) { // Create tensors - TensorType a = create_tensor(shape_a, DataType::S8, 1); - TensorType b = create_tensor(shape_b, DataType::S8, 1); - TensorType c = create_tensor(shape_c, DataType::S32, 1); + TensorType a = create_tensor(shape, DataType::S32, 1); + TensorType b = create_tensor(shape, DataType::QASYMM8, 1); // Create and configure function - FunctionType gemmlowp; - gemmlowp.configure(&a, &b, &c); + FunctionType output_stage; + output_stage.configure(&a, &b, result_offset, result_mult_int, result_shift); ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); // Allocate tensors a.allocator()->allocate(); b.allocator()->allocate(); - c.allocator()->allocate(); ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS); // Fill tensors - fill(AccessorType(a), 0, -128, 127); - fill(AccessorType(b), 1, -128, 127); - fill(AccessorType(c), 2, 0, 0); + 
fill(AccessorType(a), 0); // Compute GEMM function - gemmlowp.run(); - return c; + output_stage.run(); + return b; } - SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c) + SimpleTensor compute_reference(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) { // Create reference - SimpleTensor a{ shape_a, DataType::S8, 1 }; - SimpleTensor b{ shape_b, DataType::S8, 1 }; - SimpleTensor c{ shape_c, DataType::S32, 1 }; + SimpleTensor a{ shape, DataType::S32, 1 }; // Fill reference - fill(a, 0, -128, 127); - fill(b, 1, -128, 127); - fill(c, 2, 0, 0); + fill(a, 0); - return reference::gemmlowp(a, b, c); + return reference::gemmlowp_quantize_down_int32_to_uint8_scale(a, result_offset, result_mult_int, result_shift); } TensorType _target{}; - SimpleTensor _reference{}; + SimpleTensor _reference{}; }; - } // namespace validation } // namespace test } // namespace arm_compute -- cgit v1.2.1
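
Reviewer note (appended after the patch trailer, not part of the diff): the new reference gemmlowp_matrix_multiply_core keeps its accumulators in int32 and folds the quantization offsets into each operand before multiplying. The standalone sketch below restates that inner loop outside the test framework; the function and variable names (lowp_mm_core, va, vb) are illustrative only, and it assumes row-major uint8_t inputs of size MxK and KxN.

#include <cstdint>
#include <vector>

// C (MxN) accumulators stay in int32; the offsets are added to the raw
// quantized values before multiplication, mirroring the reference added in
// tests/validation/CPP/GEMMLowp.cpp.
std::vector<int32_t> lowp_mm_core(const std::vector<uint8_t> &a, // M x K, row-major
                                  const std::vector<uint8_t> &b, // K x N, row-major
                                  int M, int N, int K,
                                  int32_t a_offset, int32_t b_offset)
{
    std::vector<int32_t> c(static_cast<size_t>(M) * N, 0);
    for(int i = 0; i < M; ++i)
    {
        for(int k = 0; k < K; ++k)
        {
            const int32_t va = a_offset + static_cast<int32_t>(a[i * K + k]);
            for(int j = 0; j < N; ++j)
            {
                const int32_t vb = b_offset + static_cast<int32_t>(b[k * N + j]);
                c[i * N + j] += va * vb;
            }
        }
    }
    return c;
}

Keeping the result in S32 is what lets the MatrixMultiplyCore and OutputStage test suites above exercise the two runtime functions independently.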
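
Reviewer note (appended after the patch trailer, not part of the diff): the requantization performed by the QuantizeDownInt32ToUint8Scale output stage reduces to one per-element formula, ((acc + result_offset) * result_mult_int) >> result_shift, clamped to [0, 255]. The helper below is a minimal sketch of that formula only; the name quantize_down_scalar is made up for illustration and this is not the library implementation.

#include <algorithm>
#include <cstdint>

// Requantize one int32 accumulator to uint8, mirroring the reference
// gemmlowp_quantize_down_int32_to_uint8_scale added in this patch.
uint8_t quantize_down_scalar(int32_t acc, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
{
    const int32_t scaled = ((acc + result_offset) * result_mult_int) >> result_shift;
    return static_cast<uint8_t>(std::max(0, std::min(255, scaled))); // clamp to the uint8 range
}

For example, acc = 1000 with result_offset = -4, result_mult_int = 2 and result_shift = 2 gives ((1000 - 4) * 2) >> 2 = 498, which clamps to 255.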
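
Reviewer note (appended after the patch trailer, not part of the diff): for integrators, the two new runtime functions chain as in the sketch below. It only relies on calls that appear in this patch plus standard Tensor/TensorInfo setup; the concrete shapes, offsets and requantization parameters are placeholders borrowed from SmallGEMMLowpDataset, not requirements.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // A: 13x21 (K = 21), B: 21x33 (N = 33), accumulators and output: 13x33.
    Tensor a, b, acc, dst;
    a.allocator()->init(TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8));
    b.allocator()->init(TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8));
    acc.allocator()->init(TensorInfo(TensorShape(33U, 13U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(33U, 13U), 1, DataType::QASYMM8));

    // The quantization info carries the offsets consumed by the core multiply.
    a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, -2));
    b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, 0));

    // Stage 1: QASYMM8 x QASYMM8 -> S32 accumulators.
    NEGEMMLowpMatrixMultiplyCore mm_core;
    mm_core.configure(&a, &b, &acc);

    // Stage 2: requantize S32 -> QASYMM8 with offset, multiplier and shift.
    NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
    output_stage.configure(&acc, &dst, /*result_offset=*/1, /*result_mult_int=*/1, /*result_shift=*/1);

    a.allocator()->allocate();
    b.allocator()->allocate();
    acc.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized data here ...

    mm_core.run();
    output_stage.run();
    return 0;
}

Splitting the core multiply from the output stage also leaves the S32 accumulators available for other post-processing (for example a bias addition) before the final requantization.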