From dbdea0d1c025b18d4d82c278c87454427918f5b4 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 16 Oct 2019 19:21:40 +0100 Subject: COMPMID-2308: NEConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters Change-Id: Ic1bf5f0d21ccd525f84213a360f7e199d7f50577 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/2177 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- .../kernels/NEConvertQuantizedSignednessKernel.cpp | 136 +++++++ .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 4 +- .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 5 +- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 438 +++++++++++++++++---- .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 275 +++++++------ src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 2 +- src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 2 +- 7 files changed, 658 insertions(+), 204 deletions(-) create mode 100644 src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp (limited to 'src/core/NEON') diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp new file mode 100644 index 0000000000..39e030e434 --- /dev/null +++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Output auto initialization if not yet initialized
+    {
+        const bool                    is_input_signed   = input->data_type() == DataType::QASYMM8_SIGNED;
+        const DataType                dt                = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo qinfo             = input->quantization_info().uniform();
+        const int                     offset_correction = is_input_signed ? -128 : 128;
+        const QuantizationInfo        corrected_qinfo   = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+
+        auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
+    }
+
+    return std::make_pair(Status{}, calculate_max_window(*output));
+}
+} // namespace
+
+NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+    _input  = input;
+    _output = output;
+
+    std::pair<Status, Window> win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const uint8_t mask  = 128;
+    const auto    vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
+        const auto output_ptr =
reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const uint8_t in = *(reinterpret_cast(input_ptr + x)); + *(output_ptr + x) = in ^ mask; + } + }, + input, output); +} +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index c929983162..a9c04824ae 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -45,9 +45,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, - DataType::U16, DataType::S16, DataType::U32, DataType::S32, - DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); if(output->total_size() != 0) diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index 6cec51d5a2..8f5a208cbb 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -722,8 +722,8 @@ namespace { Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); TensorShape in0_shape = input0->tensor_shape(); @@ -917,6 +917,7 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo switch(_input0->info()->data_type()) { case DataType::S8: + case DataType::QASYMM8_SIGNED: { matrix_multiply_s8(ina, inb, out, width_b, out_stride, window); break; diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index 46e53cec12..3ada3a3c4f 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -72,6 +72,58 @@ inline int32x4x4_t load(const int32_t *ptr, int32_t x) }; } +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) +{ + return + { + { + vaddq_s32(a.val[0], b), + vaddq_s32(a.val[1], b), + vaddq_s32(a.val[2], b), + vaddq_s32(a.val[3], b) + } + }; +} + +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) +{ + return + { + { + vaddq_s32(a.val[0], b.val[0]), + vaddq_s32(a.val[1], b.val[1]), + vaddq_s32(a.val[2], b.val[2]), + vaddq_s32(a.val[3], b.val[3]) + } + }; +} + 
+inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) +{ + return + { + { + vmulq_n_s32(a.val[0], mul_scalar), + vmulq_n_s32(a.val[1], mul_scalar), + vmulq_n_s32(a.val[2], mul_scalar), + vmulq_n_s32(a.val[3], mul_scalar) + } + }; +} + +inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) +{ + return + { + { + vmulq_s32(a.val[0], vld1q_s32(multilpier)), + vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), + vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), + vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) + } + }; +} + inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) { int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x); @@ -141,6 +193,82 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 return out_u8; } +template +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +template +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0])); + in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1])); + in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2])); + in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3])); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + inline Window get_win_vector_sum(const Window &window) { Window win_vector_sum(window); @@ -172,50 +300,12 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) return bias_it; } -inline int32x4x4_t 
add_s32(int32x4x4_t a, int32x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b), - vaddq_s32(a.val[1], b), - vaddq_s32(a.val[2], b), - vaddq_s32(a.val[3], b) - } - }; -} - -inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b.val[0]), - vaddq_s32(a.val[1], b.val[1]), - vaddq_s32(a.val[2], b.val[2]), - vaddq_s32(a.val[3], b.val[3]) - } - }; -} - -inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) -{ - return - { - { - vmulq_n_s32(a.val[0], mul_scalar), - vmulq_n_s32(a.val[1], mul_scalar), - vmulq_n_s32(a.val[2], mul_scalar), - vmulq_n_s32(a.val[3], mul_scalar) - } - }; -} - template inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, int32_t a_offset, int32_t b_offset, int32_t k_offset, - GEMMLowpOutputStageInfo output_stage, int window_step_x, int window_start_x, int window_end_x) + int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, + int window_step_x, int window_start_x, int window_end_x) { int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; if(!is_fixed_point) @@ -251,12 +341,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su } if(!is_fixed_point) { - in_s32 = mul_s32(in_s32, output_stage.gemmlowp_multiplier); + in_s32 = mul_s32(in_s32, multiplier); } if(is_fixed_point) { - vst1q_u8(out_it.ptr() + x, finalize_quantization(in_s32, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, result_offset_s32, min_u8, max_u8)); + vst1q_u8(out_it.ptr() + x, finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8)); } else { @@ -280,24 +370,99 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, - output_stage.gemmlowp_offset, static_cast(output_stage.gemmlowp_min_bound), static_cast(output_stage.gemmlowp_max_bound)); + *(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, static_cast(min_bound), static_cast(max_bound)); } else { // Finalize quantization - in_value = (in_value * output_stage.gemmlowp_multiplier) >> output_stage.gemmlowp_shift; + in_value = (in_value * multiplier) >> shift; // Bound and store the result if(is_bounded_relu) { - in_value = static_cast(std::max(output_stage.gemmlowp_min_bound, std::min(output_stage.gemmlowp_max_bound, in_value))); + in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); } *(out_it.ptr() + x) = static_cast(std::max(0, std::min(255, in_value))); } } } +template +inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, + const int32_t *result_multipliers, const int32_t *result_shifts, + const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, + int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, + int window_step_x, int window_start_x, int window_end_x) +{ + int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; + if(!is_fixed_point) + { + // Combine quantization offset with other offsets. 
+ offset_term_s32 = add_s32(offset_term_s32, result_offset); + } + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = load_results_input(mm_result_it, x); + + if(has_a_offset) + { + in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); + } + if(has_bias) + { + in_s32 = add_s32(in_s32, load(bias_ptr, x)); + } + if(!is_fixed_point) + { + in_s32 = add_s32(in_s32, offset_term_s32); + in_s32 = mul_s32(in_s32, result_multipliers + x); + } + + if(is_fixed_point) + { + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8)); + } + else + { + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8)); + } + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + + if(has_a_offset) + { + in_value += (*(vector_sum_col_ptr + x) * a_offset); + } + if(has_bias) + { + in_value += *(bias_ptr + x); + } + + if(is_fixed_point) + { + // Finalize and store the result + *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound)); + } + else + { + // Finalize quantization + in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); + + // Bound and store the result + if(is_bounded_relu) + { + in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); + } + *(out_it.ptr() + x) = static_cast(std::max(-128, std::min(127, in_value))); + } + } +} + template void run_offset_contribution_output_stage(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, @@ -307,10 +472,16 @@ void run_offset_contribution_output_stage(const Window &window, const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; - const int32x4_t result_offset_s32 = vdupq_n_s32(output_stage.gemmlowp_offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? output_stage.gemmlowp_shift : -output_stage.gemmlowp_shift); - const uint8x16_t min_u8 = vdupq_n_u8(static_cast(output_stage.gemmlowp_min_bound)); - const uint8x16_t max_u8 = vdupq_n_u8(static_cast(output_stage.gemmlowp_max_bound)); + const int32_t multiplier = output_stage.gemmlowp_multiplier; + const int32_t shift = output_stage.gemmlowp_shift; + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? 
shift : -shift); + const uint8x16_t min_u8 = vdupq_n_u8(static_cast(min_bound)); + const uint8x16_t max_u8 = vdupq_n_u8(static_cast(max_bound)); const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -349,7 +520,8 @@ void run_offset_contribution_output_stage(const Window &window, run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -363,7 +535,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } @@ -386,7 +559,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -399,7 +573,8 @@ void run_offset_contribution_output_stage(const Window &window, + id.y() + (id.z() % depth_input) * height_input; run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, mm_result_it, out_it); } @@ -422,7 +597,8 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -434,7 +610,8 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, 
max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, mm_result_it, out_it); } @@ -448,7 +625,8 @@ void run_offset_contribution_output_stage(const Window &window, { run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, bias_it, mm_result_it, out_it); } @@ -458,7 +636,110 @@ void run_offset_contribution_output_stage(const Window &window, { run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - output_stage, window_step_x, window_start_x, window_end_x); + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + mm_result_it, out_it); + } + return; + } +} + +template +void run_offset_contribution_output_stage_symm(const Window &window, + const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, + int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, + GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); + + const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; + + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); + const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int8x16_t min_s8 = vdupq_n_s8(static_cast(min_bound)); + const int8x16_t max_s8 = vdupq_n_s8(static_cast(max_bound)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); + + Iterator mm_result_it(mm_result, win); + Iterator out_it(output, win); + + if(a_offset != 0) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; + + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + vector_sum_col_it, mm_result_it, out_it); + } + } + else + { + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); + }, + bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, mm_result_it, out_it); } @@ -470,8 +751,18 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + if(output->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 127); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < -128); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); + } + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); if(bias != nullptr) @@ -525,7 +816,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto if(output->total_size() != 0) { - 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); } @@ -551,7 +842,7 @@ std::pair validate_and_configure_window(ITensorInfo *mm_result, } NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction -get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, GEMMLowpOutputStageInfo output_stage) +get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage) { static std::map map_function = { @@ -562,7 +853,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, { 4, &run_offset_contribution_output_stage }, { 5, &run_offset_contribution_output_stage }, { 6, &run_offset_contribution_output_stage }, - { 7, &run_offset_contribution_output_stage } + { 7, &run_offset_contribution_output_stage_symm }, + { 8, &run_offset_contribution_output_stage_symm }, + { 9, &run_offset_contribution_output_stage_symm }, + { 10, &run_offset_contribution_output_stage_symm }, + { 11, &run_offset_contribution_output_stage_symm }, + { 12, &run_offset_contribution_output_stage_symm }, + { 13, &run_offset_contribution_output_stage_symm }, + { 14, &run_offset_contribution_output_stage_symm }, + { 15, &run_offset_contribution_output_stage_symm } }; // Check if input is a 3D reinterpretation @@ -574,11 +873,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound) && !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255)); + // Check if we need to perform fixed point requantization const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; + // Check if symmetric per-channel execution + const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED; + // key acts as a bitset, setting the first bit on reinterpret_as_3d, // the second on is_bounded_relu, and the third on is_fixed_point. - uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2); + uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 
1UL : 0UL) << 3); return map_function.find(key)->second; } } // namespace @@ -591,8 +894,9 @@ NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutpu } void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col, - const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) + const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, + int32_t k, int32_t a_offset, int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); @@ -627,7 +931,7 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - _function = get_configured_function(mm_result, vector_sum_row, output_stage); + _function = get_configured_function(mm_result, vector_sum_row, output, output_stage); } Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index c1ee770db5..72632492d7 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -27,13 +27,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include #include #include @@ -48,7 +48,7 @@ namespace { Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); return Status{}; @@ -72,7 +72,7 @@ std::pair validate_and_configure_window_matrix_a_reduction(ITens Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); return Status{}; @@ -128,11 +128,12 @@ Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, cons return Status{}; } -void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) +template +void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t; + using TAcc = wrapper::traits::promote_t; Window collapsed_window = 
window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -149,9 +150,9 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4_t sum_row = vdupq_n_u32(0); + auto sum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const T *matrix_a = reinterpret_cast((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2])); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -161,43 +162,41 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4); + const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4); - // Convert U8 to U16 - uint16x4x4_t a0_u16 = + // Convert 8-bit to 16-bit + typename wrapper::traits::neon_bitvector::type a0_d16[4] = { - { - vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(a0_u8))) - } + wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))), + wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))), + wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))), + wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8))) }; - // Accumulate to U16 - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]); - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]); - a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]); + // Accumulate to 16-bit + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]); + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]); + a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]); - // Accumulate to U32 - sum_row = vaddw_u16(sum_row, a0_u16.val[0]); + // Accumulate to 32-bit + sum_row = wrapper::vaddw(sum_row, a0_d16[0]); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4); + const auto a0_d8 = wrapper::vload(matrix_a + i * 4); // Convert U8 to U16 - const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8)); + const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8)); // Accumulate to U32 - sum_row = vaddw_u16(sum_row, a0_u16); + sum_row = wrapper::vaddw(sum_row, a0_d16); } auto vector_sum_row = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row)); + wrapper::vstore(vector_sum_row, wrapper::vreinterpret_s32(sum_row)); }, in, out); } @@ -206,10 +205,10 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4_t sum_row_u32 = vdupq_n_u32(0); - uint32_t sum_row = 0; + auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + TAcc sum_row = 0; - const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + const T *matrix_a = reinterpret_cast((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * 
_input->info()->strides_in_bytes()[2])); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); @@ -219,37 +218,57 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 16 accumulations for(; i <= (_k - 16); i += 16) { - const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i); + const auto a0_d8 = wrapper::vloadq(matrix_a + i); // Partial accumulations in U16 - const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8)); + const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); // Accumulate to U32 - sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0)); + vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); } // This for loop performs the leftover accumulations for(; i < _k; ++i) { - sum_row += static_cast(matrix_a[i]); + sum_row += static_cast(matrix_a[i]); } #if defined(__aarch64__) // Reduction operation available on 64 bit architectures only - sum_row += vaddvq_u32(sum_row_u32); + sum_row += wrapper::vaddv(vsum_row); #else // __aarch64__ - uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32)); - tmp = vpadd_u32(tmp, tmp); + auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); + tmp = wrapper::vpadd(tmp, tmp); - sum_row += vget_lane_u32(tmp, 0); + sum_row += wrapper::vgetlane(tmp, 0); #endif // __aarch64__ - *(reinterpret_cast(out.ptr())) = static_cast(sum_row); + *(reinterpret_cast(out.ptr())) = static_cast(sum_row); }, in, out); } } +void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_input->info()->data_type()) + { + case DataType::QASYMM8: + run_internal(window); + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + run_internal(window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } +} + void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); @@ -276,11 +295,12 @@ Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, cons return Status{}; } -void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) +template +void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t; + using TAcc = wrapper::traits::promote_t; Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -297,17 +317,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf execute_window_loop(collapsed_window, [&](const Coordinates & id) { // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4x4_t sum_col = + typename wrapper::traits::neon_bitvector::type sum_col[4] = { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), 
wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]; + const auto *matrix_b = reinterpret_cast(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -316,35 +334,28 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf int i = 0; for(; i < _k; ++i) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16); + const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16); - // Convert S8 to U16 - const uint16x8x2_t b0_u16 = + // Convert 8bit to 16bit + const typename wrapper::traits::neon_bitvector::type b0_b16[2] = { - { - vmovl_u8(vget_low_u8(b0_u8)), - vmovl_u8(vget_high_u8(b0_u8)) - } + wrapper::vmovl(wrapper::vgetlow(b0_b8)), + wrapper::vmovl(wrapper::vgethigh(b0_b8)) }; // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) - } - }; + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); } auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); - vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); - vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); - vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3])); }, in, out); } @@ -377,17 +388,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf } // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - uint32x4x4_t sum_col = + typename wrapper::traits::neon_bitvector::type sum_col[4] = { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]; + const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); @@ -398,10 +407,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop performs 4 accumulations for(; i <= (_k - 4); i += 4) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); - const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride); - const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride); - const 
uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride); + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); #if __arm__ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); @@ -410,34 +419,27 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); #endif /* __arm__ */ - // Partial accumulation in u16 - uint16x8x2_t tmp_sum = + // Partial accumulation in 16bit + typename wrapper::traits::neon_bitvector::type tmp_sum[2] = { - { - vdupq_n_u16(0), - vdupq_n_u16(0) - } + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) }; - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8)); - tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8)); - tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8)); - - // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1])) - } - }; + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); matrix_b += 4 * in_b_stride; } @@ -445,38 +447,51 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf // This for loop perfoms the leftover accumulations for(; i < _k; ++i) { - const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride); + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); // Convert S8 to S16 - const uint16x8x2_t b0_u16 = + const typename wrapper::traits::neon_bitvector::type b0_b16[2] { - { - vmovl_u8(vget_low_u8(b0_u8)), - vmovl_u8(vget_high_u8(b0_u8)) - } + wrapper::vmovl(wrapper::vgetlow(b0_b8)), + wrapper::vmovl(wrapper::vgethigh(b0_b8)) }; - // Accumulate to U32 - sum_col = - { - { - vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])), - vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])), - vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1])) - } - }; + // Accumulate to 32bit + 
sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); matrix_b += in_b_stride; } auto vector_sum_col = reinterpret_cast(out.ptr()); - vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0])); - vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1])); - vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2])); - vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3])); + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3])); }, inb, out); } } + +void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_input->info()->data_type()) + { + case DataType::QASYMM8: + run_internal(window, info); + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + run_internal(window, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } +} diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 0ca7fd3dc8..ea3d32e628 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -55,7 +55,7 @@ TensorShape get_output_shape(const ITensorInfo *input) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp index 624833adfb..649316442e 100644 --- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp @@ -49,7 +49,7 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); if(biases != nullptr) -- cgit v1.2.1
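
The new NEConvertQuantizedSignednessKernel above flips a tensor between QASYMM8 and QASYMM8_SIGNED by XOR-ing every byte with a 128 mask (16 bytes at a time via wrapper::veor) and pairing that with a +/-128 correction of the quantization offset. The scalar sketch below is not library code, just a plain C++ illustration of why the XOR is equivalent to shifting the stored value by 128, which is what makes the round trip lossless.

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const uint8_t mask = 128; // same constant the kernel broadcasts into a vector

    std::vector<uint8_t> qasymm8 = { 0, 1, 127, 128, 200, 255 };

    for(uint8_t q : qasymm8)
    {
        // Flip the sign bit; reinterpreting the result as int8_t yields q - 128.
        const uint8_t flipped = static_cast<uint8_t>(q ^ mask);
        const int8_t  q_s8    = static_cast<int8_t>(flipped);

        // Flipping again restores the original unsigned value.
        const uint8_t restored = static_cast<uint8_t>(static_cast<uint8_t>(q_s8) ^ mask);

        std::printf("u8 %3u -> s8 %4d -> u8 %3u\n",
                    static_cast<unsigned>(q), static_cast<int>(q_s8), static_cast<unsigned>(restored));
    }
    return 0;
}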
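
get_configured_function() in NEGEMMLowpOffsetContributionOutputStageKernel.cpp packs reinterpret_as_3d, is_bounded_relu, is_fixed_point and the new is_symm flag into a 4-bit key and looks the key up in a map of pre-instantiated template functions; the extra bit is what doubles the map from 8 to 16 entries in this patch. The self-contained sketch below only illustrates that dispatch idea: DemoFn and the lambda bodies are placeholder stand-ins, not the library's run_offset_contribution_output_stage(_symm) instantiations.

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>

using DemoFn = std::function<void()>;

int main()
{
    const bool reinterpret_as_3d = false;
    const bool is_bounded_relu   = true; // min/max bounds narrower than the full output range
    const bool is_fixed_point    = true; // QUANTIZE_DOWN_FIXEDPOINT output stage
    const bool is_symm           = true; // QASYMM8_SIGNED output, i.e. per-channel symmetric weights

    // Same packing as the patch: bit0 = 3D, bit1 = bounded ReLU, bit2 = fixed point, bit3 = symmetric.
    const uint8_t key = (reinterpret_as_3d ? 1U : 0U)
                      | ((is_bounded_relu ? 1U : 0U) << 1)
                      | ((is_fixed_point ? 1U : 0U) << 2)
                      | ((is_symm ? 1U : 0U) << 3);

    // With this packing, keys with the top bit set (8-15) are the symmetric variants.
    std::map<uint8_t, DemoFn> map_function;
    for(uint8_t k = 0; k < 16; ++k)
    {
        map_function[k] = [k]() { std::cout << "would run variant for key " << static_cast<int>(k) << "\n"; };
    }

    map_function.at(key)(); // prints "would run variant for key 14"
    return 0;
}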