From 448a81fcec04333364a1e3266d5081596d3a0477 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 21 Nov 2019 14:10:25 +0000 Subject: COMPMID-2805: Add QASYMM8_SIGNED support in NEGEMMLowpOutputStage Add support from requantizing down from S32 to Int8 with fixed point requantization. This involves the following: - Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - Add bias to final result if bias tensor is not a nullptr - Round to nearest division by a power-of-two using result_shift - Add offset to each result - Clamp the value between the specified min and max bounds - Cast to int8 data type Change-Id: I641b3fac0833c568d8565ccb859bbc561a24c17d Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/2340 Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins --- arm_compute/core/NEON/NEAsymm.h | 60 +++++ arm_compute/core/NEON/NEKernels.h | 1 + ...NEGEMMLowpOffsetContributionOutputStageKernel.h | 4 +- ...uantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 119 ++++++++++ .../runtime/NEON/functions/NEGEMMLowpOutputStage.h | 59 +++++ ...ntizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 246 +++++++++++++++++++++ .../NEON/functions/NEGEMMLowpOutputStage.cpp | 14 ++ tests/validate_examples/cl_gemm.cpp | 4 +- tests/validation/NEON/GEMMLowp.cpp | 113 ++++++++++ tests/validation/fixtures/GEMMLowpFixture.h | 110 ++++++++- tests/validation/reference/GEMMLowp.cpp | 135 +++++------ tests/validation/reference/GEMMLowp.h | 19 +- utils/TypePrinter.h | 3 + 13 files changed, 786 insertions(+), 101 deletions(-) create mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h create mode 100644 src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h index c75a58046b..40bdd0f5bf 100644 --- a/arm_compute/core/NEON/NEAsymm.h +++ b/arm_compute/core/NEON/NEAsymm.h @@ -115,6 +115,66 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32, return out_u8; } +/** Performs final quantization step on 16 elements + * + * @tparam is_bounded_relu Specified if a fused bounded relu should be applied + * + * @param in_s32 Input to be quantized. 
+ * @param result_fixedpoint_multiplier Result multiplier parameter + * @param result_shift Result shift parameter + * @param result_offset_after_shift_s32 Result offset parameter + * @param min_s8 Relu lower bound + * @param max_s8 Relu upper bound + * + * @return Quantized values + */ +template +int8x16_t finalize_quantization(int32x4x4_t &in_s32, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32x4_t result_offset_after_shift_s32, + int8x16_t min_s8, + int8x16_t max_s8) +{ + // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar + in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + + // Round to the nearest division by a power-of-two using result_shift_s32 + in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift); + in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift); + in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift); + in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift); + + // Add the offset terms + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + /** Performs final quantization step on 16 elements for symmetric quantization * * @tparam is_bounded_relu Specified if a fused bounded relu should be applied diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index aa46a346e9..05485d847a 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -80,6 +80,7 @@ #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h index c284ca5c5f..dadc5c221b 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h @@ -83,7 +83,7 @@ public: * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row 
of matrix A. * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. - * @param[out] output Output tensor containing the final quantized result. Data type supported: QASYMM8 + * @param[out] output Output tensor containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] k Number of matrix A columns or Matrix B rows * @param[in] a_offset Offset to be added to each element of the matrix A. * @param[in] b_offset Offset to be added to each element of the matrix B. @@ -100,7 +100,7 @@ public: * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. - * @param[in] output Output tensor info containing the final quantized result. Data type supported: QASYMM8 + * @param[in] output Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] a_offset Offset to be added to each element of the matrix A. * @param[in] b_offset Offset to be added to each element of the matrix B. * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..2b3657c728 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H +#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. + * + */ +class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public INEKernel +{ +public: + const char *name() const override + { + return "NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel"; + } + /** Constructor */ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. 
Data type supported: QASYMM8_SIGNED + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template + void run(const Window &window); + + /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions + * + * @param[in] window Region on which to execute the kernel.
+ */ + using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(const Window &window); + + QuantizeDownFunctionPtr _func; + const ITensor *_input; + const ITensor *_bias; + ITensor *_output; + int _result_fixedpoint_multiplier; + int _result_shift; + int _result_offset_after_shift; + int _min; + int _max; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h index 5ece753660..1a65f3b6ce 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -147,6 +147,65 @@ public: */ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); }; +/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on NEON. + * + * NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters: + * + * result_fixedpoint_multiplier, result_shift, result_offset_after_shift + * + * The final result is: + * + * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift + * + * where FixedPointMul(x, y) is the nearest integer to the following + * mathematical expression, evaluated without overflow or intermediate rounding: + * + * (x * y) / 2^31 + * + * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 + * + * In case the bias tensor is provided, the final result is: + * + * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift + * + * This function calls the following NEON kernels: + * + * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + * + * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions + * after the result is shifted right by result_shift +*/ +class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. 
Data type supported: QASYMM8_SIGNED + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint + * + * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); +}; /** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on NEON. * * NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters: diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..d24089d615 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(max > 127); + ARM_COMPUTE_RETURN_ERROR_ON(min < -128 || min > max); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + } + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8_SIGNED)); + + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); + + // NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + + return std::make_pair(Status{}, win); +} +} // namespace + +template +void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window &window) +{ + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); + const int8x16_t min_s8 = vdupq_n_s8(static_cast(_min)); + const int8x16_t max_s8 = vdupq_n_s8(static_cast(_max)); + + ARM_COMPUTE_UNUSED(min_s8, max_s8); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(_input, win_collapsed); + Iterator out(_output, win_collapsed); + if(_bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias(_bias, win_biases); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + 
vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) + } + }; + + const int32x4x4_t bias_s32 = + { + { + vld1q_s32(reinterpret_cast(bias.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(bias.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(bias.ptr()) + x + 12) + } + }; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_s8(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast(bias.ptr()) + x); + int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max)); + } + }, + in, out, bias); + } + else + { + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) + } + }; + + vst1q_s8(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Finalize and store the result + *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max)); + } + }, + in, out); + } +} + +NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() + : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0) +{ +} + +void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, + int result_offset_after_shift, int min, int max) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? 
bias->info() : nullptr, output->info(), min, max)); + + _input = input; + _bias = bias; + _output = output; + _result_fixedpoint_multiplier = result_fixedpoint_multiplier; + _result_shift = result_shift; + _result_offset_after_shift = result_offset_after_shift; + _min = min; + _max = max; + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); + + // Check if we need to clamp the result using min and max + const bool is_bounded_relu = ((min != max) && !(min == -128 && max == 127)); + _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run : &NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run; +} + +Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); + + return Status{}; +} + +void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + (this->*_func)(window); +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index b89e7a168e..3ef9351b78 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" #include "support/ToolchainSupport.h" @@ -56,6 +57,19 @@ Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITens return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); } +void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, + int result_offset_after_shift, int min, int max) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + _kernel = std::move(k); +} + +Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +{ + return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); +} + void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max) { auto k = arm_compute::support::cpp14::make_unique(); diff --git 
a/tests/validate_examples/cl_gemm.cpp b/tests/validate_examples/cl_gemm.cpp index 128c5f6e7f..39fe111448 100644 --- a/tests/validate_examples/cl_gemm.cpp +++ b/tests/validate_examples/cl_gemm.cpp @@ -321,11 +321,11 @@ public: SimpleTensor biases{ TensorShape(N), DataType::S32, 1 }; // Fill bias fill(biases, 3); - ref_dst = reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(ref_tmp_dst, biases, dst_multiplier_vec, dst_shift_vec, offset_dst); + ref_dst = reference::gemmlowp_quantize_down_scale_by_fixedpoint(ref_tmp_dst, biases, dst_multiplier_vec, dst_shift_vec, offset_dst); } else { - ref_dst = reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(ref_tmp_dst, dst_multiplier_vec, dst_shift_vec, offset_dst); + ref_dst = reference::gemmlowp_quantize_down_scale_by_fixedpoint(ref_tmp_dst, dst_multiplier_vec, dst_shift_vec, offset_dst); } validate(CLAccessor(dst), ref_dst); break; diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index b79523da1a..78fbc5845f 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -410,6 +410,119 @@ TEST_SUITE_END() // BoundedReLu TEST_SUITE_END() // QuantizeDownInt32ToUint8ScaleByFixedPoint +TEST_SUITE(QuantizeDownInt32ToInt8ScaleByFixedPoint) + +const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, + 2) + * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true }); + +const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, + 2) + * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true }); + +using NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture = + GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture; + +// *INDENT-OFF* +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( + framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::F32), // Invalid input data type + TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Invalid min and max + TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type + TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), + }), + framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32), + TensorInfo(TensorShape(21U), 1, DataType::S32), + TensorInfo(TensorShape(20U), 1, DataType::S32), + TensorInfo(TensorShape(21U), 1, DataType::S32), + })), + framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), + TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED), + })), + framework::dataset::make("Min",{ -110, + -130, + -113, + -113, + })), + framework::dataset::make("Max",{ 87, + 140, + 97, + 97, + })), + framework::dataset::make("Expected", { false, false, false, true })), + a_info, b_info, output_info, min, max, expected) +{ + // Lock tensors + Status status = 
NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false), + &b_info.clone()->set_is_resizable(false), + &output_info.clone()->set_is_resizable(false), + min, + max); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); +} +// clang-format on +// *INDENT-ON* + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), + quantize_down_int32_to_int8_scale_by_fixedpoint_cases), + shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias) +{ + TensorShape shape_bias(shape[0]); + + // Create tensors + Tensor in = create_tensor(shape, DataType::S32); + Tensor bias = create_tensor(shape_bias, DataType::S32); + Tensor out = create_tensor(shape, DataType::QASYMM8_SIGNED); + + ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint output_stage; + output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + + // Validate valid region input and output + const ValidRegion valid_region = shape_to_valid_region(shape); + validate(in.info()->valid_region(), valid_region); + validate(out.info()->valid_region(), valid_region); + + // Validate valid region bias + if(add_bias) + { + const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias); + validate(bias.info()->valid_region(), valid_region_bias); + } + + // Validate padding + const PaddingSize padding(0); + validate(in.info()->padding(), padding); + validate(out.info()->padding(), padding); + + if(add_bias) + { + validate(bias.info()->padding(), padding); + } +} +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), + quantize_down_int32_to_int8_scale_by_fixedpoint_cases)) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +TEST_SUITE(BoundedReLu) +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), + quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // BoundedReLu +TEST_SUITE_END() // QuantizeDownInt32ToInt8ScaleByFixedPoint + TEST_SUITE(QuantizeDownInt32ToInt16ScaleByFixedPoint) const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index 5d092ecac2..c17105edad 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -254,8 +254,8 @@ protected: output_stage.gemmlowp_offset, output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound); break; case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(output, bias, - output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_offset, 
output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound); + return reference::gemmlowp_quantize_down_scale_by_fixedpoint(output, bias, + output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound); break; default: ARM_COMPUTE_ERROR("Not Supported!"); @@ -360,6 +360,101 @@ protected: SimpleTensor _reference{}; }; +template +class GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture : public framework::Fixture +{ +public: + template + void setup(TensorShape shape, int32_t result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max, bool add_bias) + { + _target = compute_target(shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias); + _reference = compute_reference(shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias); + } + +protected: + template + void fill(U &&tensor, int i) + { + std::uniform_int_distribution<> distribution(-6000, 6000); + library->fill(tensor, distribution, i); + } + + TensorType compute_target(const TensorShape &shape, int32_t result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max, bool add_bias) + { + TensorShape shape_bias(shape[0]); + + // Create tensors + TensorType a = create_tensor(shape, DataType::S32, 1); + TensorType b = create_tensor(shape_bias, DataType::S32, 1); + TensorType c = create_tensor(shape, DataType::QASYMM8_SIGNED, 1); + + // Create and configure function + FunctionType output_stage; + output_stage.configure(&a, add_bias ? &b : nullptr, &c, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + + ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Allocate tensors + a.allocator()->allocate(); + c.allocator()->allocate(); + + ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Fill tensor + fill(AccessorType(a), 0); + + if(add_bias) + { + ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Allocate bias tensor + b.allocator()->allocate(); + + ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Fill tensor + fill(AccessorType(b), 1); + } + + // Compute GEMM function + output_stage.run(); + return c; + } + + SimpleTensor compute_reference(const TensorShape &shape, int32_t result_fixed_point_multiplier, int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max, + bool add_bias) + { + // Create reference + TensorShape shape_bias(shape[0]); + + SimpleTensor a{ shape, DataType::S32, 1 }; + SimpleTensor b{ shape_bias, DataType::S32, 1 }; + + // Fill reference + fill(a, 0); + + const std::vector result_fixed_point_multiplier_vec = { result_fixed_point_multiplier }; + const std::vector result_shift_vec = { result_shift }; + + if(add_bias) + { + // Fill bias + fill(b, 1); + + return reference::gemmlowp_quantize_down_scale_by_fixedpoint(a, b, result_fixed_point_multiplier_vec, result_shift_vec, result_offset_after_shift, min, max); + } + else + { + return reference::gemmlowp_quantize_down_scale_by_fixedpoint(a, result_fixed_point_multiplier_vec, result_shift_vec, result_offset_after_shift, min, max); + } + } + + TensorType 
_target{}; + SimpleTensor _reference{}; +}; + template class GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture : public framework::Fixture { @@ -443,11 +538,11 @@ protected: // Fill bias fill(b, 1); - return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(a, b, result_fixed_point_multiplier_vec, result_shift_vec, result_offset_after_shift, min, max); + return reference::gemmlowp_quantize_down_scale_by_fixedpoint(a, b, result_fixed_point_multiplier_vec, result_shift_vec, result_offset_after_shift, min, max); } else { - return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(a, result_fixed_point_multiplier_vec, result_shift_vec, result_offset_after_shift, min, max); + return reference::gemmlowp_quantize_down_scale_by_fixedpoint(a, result_fixed_point_multiplier_vec, result_shift_vec, result_offset_after_shift, min, max); } } @@ -530,16 +625,19 @@ protected: // Fill reference fill(a, 0); + const std::vector result_fixed_point_multiplier_vec = { result_fixed_point_multiplier }; + const std::vector result_shift_vec = { result_shift }; + if(add_bias) { // Fill bias fill(b, 1); - return reference::gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(a, b, result_fixed_point_multiplier, result_shift, min, max); + return reference::gemmlowp_quantize_down_scale_by_fixedpoint(a, b, result_fixed_point_multiplier_vec, result_shift_vec, 0, min, max); } else { - return reference::gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(a, result_fixed_point_multiplier, result_shift, min, max); + return reference::gemmlowp_quantize_down_scale_by_fixedpoint(a, result_fixed_point_multiplier_vec, result_shift_vec, 0, min, max); } } diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp index 08be4a5182..4529b91a48 100644 --- a/tests/validation/reference/GEMMLowp.cpp +++ b/tests/validation/reference/GEMMLowp.cpp @@ -38,6 +38,28 @@ namespace reference { namespace { +template +struct DataTypeExtractor +{ + static DataType data_type() + { + DataType data_type = DataType::UNKNOWN; + if(std::is_same::value) + { + data_type = DataType::QASYMM8_SIGNED; + } + else if(std::is_same::value) + { + data_type = DataType::QASYMM8; + } + else if(std::is_same::value) + { + data_type = DataType::QSYMM16; + } + return data_type; + } +}; + template void quantize_down_int32_to_uint8_scale(const SimpleTensor *in, const SimpleTensor *bias, SimpleTensor *dst, int32_t result_offset, std::vector result_mult_int, std::vector result_shift, int32_t min, int32_t max) @@ -68,16 +90,16 @@ void quantize_down_int32_to_uint8_scale(const SimpleTensor *in, const SimpleT } } -template -void quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor *in, const SimpleTensor *bias, SimpleTensor *dst, std::vector result_fixedpoint_multiplier, - std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max) +template +void quantize_down_scale_by_fixedpoint(const SimpleTensor *in, const SimpleTensor *bias, SimpleTensor *dst, std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max) { const int cols_in = in->shape().x(); const bool is_per_channel = result_fixedpoint_multiplier.size() > 1; for(int i = 0; i < in->num_elements(); ++i) { - int32_t result = (*in)[i]; + TIn result = (*in)[i]; if(bias != nullptr) { @@ -88,43 +110,15 @@ void quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor *in, const int32_t multiplier = 
(is_per_channel) ? result_fixedpoint_multiplier[i % cols_in] : result_fixedpoint_multiplier[0]; const int32_t shift = (is_per_channel) ? result_shift[i % cols_in] : result_shift[0]; - result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, multiplier), shift); - result += result_offset_after_shift; - - // Bounded ReLu - if(min != max) + if(shift < 0) { - result = std::max(min, std::min(max, result)); - } - - (*dst)[i] = static_cast(std::max(0, std::min(255, result))); - } -} - -template -void quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor *in, const SimpleTensor *bias, SimpleTensor *dst, int32_t result_fixedpoint_multiplier, int32_t result_shift, - int32_t min, int32_t max) -{ - const int cols_in = in->shape().x(); - - for(int i = 0; i < in->num_elements(); ++i) - { - int32_t result = (*in)[i]; - - if(bias != nullptr) - { - result += (*bias)[i % cols_in]; - } - - // Fixed point multiplication - if(result_shift < 0) - { - result = asymm_int_mult(result * (1 << (-result_shift)), result_fixedpoint_multiplier); + result = asymm_int_mult(result * (1 << (-shift)), multiplier); } else { - result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, result_fixedpoint_multiplier), result_shift); + result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, multiplier), shift); } + result += result_offset_after_shift; // Bounded ReLu if(min != max) @@ -132,7 +126,8 @@ void quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor *in, result = std::max(min, std::min(max, result)); } - (*dst)[i] = static_cast(std::max(-32768, std::min(32767, result))); + (*dst)[i] = static_cast(std::max(std::numeric_limits::lowest(), + std::min(std::numeric_limits::max(), result))); } } } // namespace @@ -219,59 +214,43 @@ SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTe return dst; } -template -SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor &in, std::vector result_fixedpoint_multiplier, std::vector result_shift, - int32_t result_offset_after_shift, int32_t min, int32_t max) +template +SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &in, std::vector result_fixedpoint_multiplier, std::vector result_shift, + int32_t result_offset_after_shift, int32_t min, int32_t max) { - SimpleTensor dst(in.shape(), DataType::QASYMM8); + SimpleTensor dst(in.shape(), DataTypeExtractor::data_type()); - quantize_down_int32_to_uint8_scale_by_fixedpoint(&in, nullptr, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + quantize_down_scale_by_fixedpoint(&in, nullptr, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); return dst; } -template -SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor &in, const SimpleTensor &bias, std::vector result_fixedpoint_multiplier, - std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max) -{ - SimpleTensor dst(in.shape(), DataType::QASYMM8); - - quantize_down_int32_to_uint8_scale_by_fixedpoint(&in, &bias, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - - return dst; -} - -template -SimpleTensor gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor &in, int32_t result_fixedpoint_multiplier, int32_t result_shift, int32_t min, - int32_t max) -{ - SimpleTensor dst(in.shape(), DataType::QSYMM16); - - quantize_down_int32_to_int16_scale_by_fixedpoint(&in, nullptr, &dst, 
result_fixedpoint_multiplier, result_shift, min, max); - - return dst; -} - -template -SimpleTensor gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor &in, const SimpleTensor &bias, int32_t result_fixedpoint_multiplier, int32_t result_shift, - int32_t min, int32_t max) +template +SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &in, const SimpleTensor &bias, std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max) { - SimpleTensor dst(in.shape(), DataType::QSYMM16); + SimpleTensor dst(in.shape(), DataTypeExtractor::data_type()); - quantize_down_int32_to_int16_scale_by_fixedpoint(&in, &bias, &dst, result_fixedpoint_multiplier, result_shift, min, max); + quantize_down_scale_by_fixedpoint(&in, &bias, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); return dst; } -template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor &a, std::vector result_fixedpoint_multiplier, - std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max); -template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor &a, const SimpleTensor &b, - std::vector result_fixedpoint_multiplier, - std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max); -template SimpleTensor gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor &a, int32_t result_fixedpoint_multiplier, int32_t result_shift, - int32_t min, int32_t max); -template SimpleTensor gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor &a, const SimpleTensor &b, int32_t result_fixedpoint_multiplier, - int32_t result_shift, int32_t min, int32_t max); +template SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &a, std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max); +template SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &a, const SimpleTensor &b, + std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max); +template SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &a, std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max); +template SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &a, const SimpleTensor &b, + std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max); +template SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &a, std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max); +template SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &a, const SimpleTensor &b, + std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max); template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &a, int32_t result_offset, std::vector result_mult_int, std::vector result_shift, int32_t min, int32_t max); template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &a, const SimpleTensor &b, 
int32_t result_offset, std::vector result_mult_int, diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h index 815527e1b7..7ff01ef611 100644 --- a/tests/validation/reference/GEMMLowp.h +++ b/tests/validation/reference/GEMMLowp.h @@ -52,20 +52,13 @@ template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, const SimpleTensor &bias, int32_t result_offset, std::vector result_mult_int, std::vector result_shift, int32_t min = 0, int32_t max = 0); -template -SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor &in, std::vector result_fixedpoint_multiplier, std::vector result_shift, - int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0); - -template -SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor &in, const SimpleTensor &bias, std::vector result_fixedpoint_multiplier, - std::vector result_shift, int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0); +template +SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &in, std::vector result_fixedpoint_multiplier, std::vector result_shift, + int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0); -template -SimpleTensor gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor &in, int32_t result_fixedpoint_multiplier, int32_t result_shift, - int32_t min, int32_t max); -template -SimpleTensor gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor &in, const SimpleTensor &bias, int32_t result_fixedpoint_multiplier, - int32_t result_shift, int32_t min, int32_t max); +template +SimpleTensor gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor &in, const SimpleTensor &bias, std::vector result_fixedpoint_multiplier, + std::vector result_shift, int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0); } // namespace reference } // namespace validation } // namespace test diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h index ede2ea4b63..3f638d7e97 100644 --- a/utils/TypePrinter.h +++ b/utils/TypePrinter.h @@ -628,6 +628,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DataType &data_type) case DataType::QASYMM8: os << "QASYMM8"; break; + case DataType::QASYMM8_SIGNED: + os << "QASYMM8_SIGNED"; + break; case DataType::QSYMM8_PER_CHANNEL: os << "QSYMM8_PER_CHANNEL"; break; -- cgit v1.2.1
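
The requantization described in the commit message and in the kernel documentation above (fixed point multiply, rounding division by a power of two, offset addition, optional clamping, cast to int8) can be followed element by element with a small scalar sketch. This is an illustration of the arithmetic only, not the NEON code added by the patch; the helper names below are invented for the example, and the saturation edge case of the doubling high multiply (INT32_MIN * INT32_MIN) is left out for brevity.

#include <algorithm>
#include <cstdint>

// Nearest integer to (x * multiplier) / 2^31: the scalar equivalent of the
// saturating rounding doubling multiply high performed by vqrdmulhq_n_s32.
inline int32_t fixed_point_mul(int32_t x, int32_t multiplier)
{
    const int64_t ab    = static_cast<int64_t>(x) * static_cast<int64_t>(multiplier);
    const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) / (static_cast<int64_t>(1) << 31));
}

// Round-to-nearest division by 2^exponent, matching the behaviour of rounding_divide_by_pow2.
inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (((x & mask) > threshold) ? 1 : 0);
}

// One element of the S32 -> QASYMM8_SIGNED output stage.
inline int8_t requantize_s32_to_s8(int32_t acc, int32_t bias, int32_t result_fixedpoint_multiplier,
                                   int result_shift, int32_t result_offset_after_shift,
                                   int32_t min, int32_t max)
{
    int32_t result = acc + bias; // bias is 0 when no bias tensor is provided
    result         = rounding_divide_by_pow2(fixed_point_mul(result, result_fixedpoint_multiplier), result_shift);
    result += result_offset_after_shift;
    if(min != max) // optional bounded ReLU
    {
        result = std::max(min, std::min(max, result));
    }
    // Saturate to the int8 range and cast
    return static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, result)));
}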
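
A hypothetical usage sketch of the new runtime function, based on the configure() signature introduced in this patch, is shown below. The tensor shapes and the multiplier, shift and offset values are placeholders; in practice the multiplier and shift are derived from the input, weight and output quantization scales, and min = -128 with max = 127 simply cover the full int8 range so no extra clamping is applied.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void quantize_down_s32_to_s8_example()
{
    Tensor acc;  // S32 accumulators, e.g. the output of NEGEMMLowpMatrixMultiplyCore
    Tensor bias; // optional shared bias, S32, shape [OFM]
    Tensor dst;  // final QASYMM8_SIGNED result

    acc.allocator()->init(TensorInfo(TensorShape(21U, 13U), 1, DataType::S32));
    bias.allocator()->init(TensorInfo(TensorShape(21U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED));

    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint output_stage;
    // result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max (placeholder values)
    output_stage.configure(&acc, &bias, &dst, 254601600, 1, 10, -128, 127);

    acc.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill acc and bias ...

    output_stage.run();
}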