Diffstat (limited to 'src')
-rw-r--r--  src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp | 136
-rw-r--r--  src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 4
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 5
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp | 438
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp | 275
-rw-r--r--  src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 2
-rw-r--r--  src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 2
-rw-r--r--  src/core/Utils.cpp | 4
-rw-r--r--  src/core/utils/quantization/AsymmHelpers.cpp | 38
-rw-r--r--  src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp | 16
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp | 88
-rw-r--r--  src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp | 7
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp | 148
13 files changed, 876 insertions, 287 deletions
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
new file mode 100644
index 0000000000..39e030e434
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Output auto initialization if not yet initialized
+ {
+ const bool is_input_signed = input->data_type() == DataType::QASYMM8_SIGNED;
+ const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo qinfo = input->quantization_info().uniform();
+ const int offset_correction = is_input_signed ? -128 : 128;
+ const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+
+ auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
+ }
+
+ return std::make_pair(Status{}, calculate_max_window(*output));
+}
+} // namespace
+
+NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ std::pair<Status, Window> win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const uint8_t mask = 128;
+ const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
+
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
+ *(output_ptr + x) = in ^ mask;
+ }
+ },
+ input, output);
+}
+} // namespace arm_compute
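The new kernel converts between QASYMM8 and QASYMM8_SIGNED by XOR-ing every element with 0x80, which shifts each code point by 128; the configure step keeps the scale and moves the zero point by 128 accordingly (the offset_correction above). A minimal scalar sketch of the bit trick, plain C++ with no NEON wrappers and purely illustrative values:

#include <cassert>
#include <cstdint>

int main()
{
    for(int q = 0; q < 256; ++q)
    {
        const uint8_t qu = static_cast<uint8_t>(q);
        const int8_t  qs = static_cast<int8_t>(qu ^ 0x80);                  // signed view produced by the kernel
        assert(static_cast<int32_t>(qs) == static_cast<int32_t>(qu) - 128); // same code point, shifted by 128
        assert(static_cast<uint8_t>(qs ^ 0x80) == qu);                      // the mapping is its own inverse
    }
    return 0;
}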
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index c929983162..a9c04824ae 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -45,9 +45,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
- DataType::U16, DataType::S16, DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
if(output->total_size() != 0)
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 6cec51d5a2..8f5a208cbb 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -722,8 +722,8 @@ namespace
{
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
TensorShape in0_shape = input0->tensor_shape();
@@ -917,6 +917,7 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo
switch(_input0->info()->data_type())
{
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
{
matrix_multiply_s8(ina, inb, out, width_b, out_stride, window);
break;
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 46e53cec12..3ada3a3c4f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -72,6 +72,58 @@ inline int32x4x4_t load(const int32_t *ptr, int32_t x)
};
}
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
+{
+ return
+ {
+ {
+ vaddq_s32(a.val[0], b),
+ vaddq_s32(a.val[1], b),
+ vaddq_s32(a.val[2], b),
+ vaddq_s32(a.val[3], b)
+ }
+ };
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
+{
+ return
+ {
+ {
+ vaddq_s32(a.val[0], b.val[0]),
+ vaddq_s32(a.val[1], b.val[1]),
+ vaddq_s32(a.val[2], b.val[2]),
+ vaddq_s32(a.val[3], b.val[3])
+ }
+ };
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
+{
+ return
+ {
+ {
+ vmulq_n_s32(a.val[0], mul_scalar),
+ vmulq_n_s32(a.val[1], mul_scalar),
+ vmulq_n_s32(a.val[2], mul_scalar),
+ vmulq_n_s32(a.val[3], mul_scalar)
+ }
+ };
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multiplier)
+{
+ return
+ {
+ {
+ vmulq_s32(a.val[0], vld1q_s32(multiplier)),
+ vmulq_s32(a.val[1], vld1q_s32(multiplier + 4)),
+ vmulq_s32(a.val[2], vld1q_s32(multiplier + 8)),
+ vmulq_s32(a.val[3], vld1q_s32(multiplier + 12))
+ }
+ };
+}
+
inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
{
int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x);
@@ -141,6 +193,82 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3
return out_u8;
}
+template <bool is_bounded_relu>
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
+template <bool is_bounded_relu>
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0]));
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1]));
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2]));
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3]));
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
inline Window get_win_vector_sum(const Window &window)
{
Window win_vector_sum(window);
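The two overloads added above are the signed counterparts of the existing QUANTIZE_DOWN ("floating point") finalization: shift the multiplied accumulator right, clamp negatives to zero, saturate to int8, then optionally apply bounded-ReLU limits. A scalar sketch of the same sequence, assuming shift is the non-negative right-shift amount; it mirrors the vector code rather than being copied from the library:

#include <algorithm>
#include <cstdint>

int8_t quantize_down_s8(int32_t acc, int32_t multiplier, int32_t shift,
                        int32_t min_bound, int32_t max_bound, bool bounded_relu)
{
    int32_t v = (acc * multiplier) >> shift;                      // integer multiply, then right shift
    v = std::max(v, 0);                                           // matches the vmaxq_s32(..., zero_s32) step
    if(bounded_relu)
    {
        v = std::max(min_bound, std::min(max_bound, v));          // activation bounds
    }
    return static_cast<int8_t>(std::max(-128, std::min(127, v))); // saturate as vqmovn_s32/vqmovn_s16 would
}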
@@ -172,50 +300,12 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias)
return bias_it;
}
-inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
-{
- return
- {
- {
- vaddq_s32(a.val[0], b),
- vaddq_s32(a.val[1], b),
- vaddq_s32(a.val[2], b),
- vaddq_s32(a.val[3], b)
- }
- };
-}
-
-inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
-{
- return
- {
- {
- vaddq_s32(a.val[0], b.val[0]),
- vaddq_s32(a.val[1], b.val[1]),
- vaddq_s32(a.val[2], b.val[2]),
- vaddq_s32(a.val[3], b.val[3])
- }
- };
-}
-
-inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
-{
- return
- {
- {
- vmulq_n_s32(a.val[0], mul_scalar),
- vmulq_n_s32(a.val[1], mul_scalar),
- vmulq_n_s32(a.val[2], mul_scalar),
- vmulq_n_s32(a.val[3], mul_scalar)
- }
- };
-}
-
template <bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8,
int32_t a_offset, int32_t b_offset, int32_t k_offset,
- GEMMLowpOutputStageInfo output_stage, int window_step_x, int window_start_x, int window_end_x)
+ int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
+ int window_step_x, int window_start_x, int window_end_x)
{
int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
if(!is_fixed_point)
@@ -251,12 +341,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
}
if(!is_fixed_point)
{
- in_s32 = mul_s32(in_s32, output_stage.gemmlowp_multiplier);
+ in_s32 = mul_s32(in_s32, multiplier);
}
if(is_fixed_point)
{
- vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, result_offset_s32, min_u8, max_u8));
+ vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8));
}
else
{
@@ -280,24 +370,99 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
if(is_fixed_point)
{
// Finalize and store the result
- *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift,
- output_stage.gemmlowp_offset, static_cast<uint8_t>(output_stage.gemmlowp_min_bound), static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+ *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset, static_cast<uint8_t>(min_bound), static_cast<uint8_t>(max_bound));
}
else
{
// Finalize quantization
- in_value = (in_value * output_stage.gemmlowp_multiplier) >> output_stage.gemmlowp_shift;
+ in_value = (in_value * multiplier) >> shift;
// Bound and store the result
if(is_bounded_relu)
{
- in_value = static_cast<uint8_t>(std::max<int32_t>(output_stage.gemmlowp_min_bound, std::min<int32_t>(output_stage.gemmlowp_max_bound, in_value)));
+ in_value = static_cast<uint8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
}
*(out_it.ptr() + x) = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
}
}
}
+template <bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
+inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
+ const int32_t *result_multipliers, const int32_t *result_shifts,
+ const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
+ int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
+ int window_step_x, int window_start_x, int window_end_x)
+{
+ int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
+ if(!is_fixed_point)
+ {
+ // Combine quantization offset with other offsets.
+ offset_term_s32 = add_s32(offset_term_s32, result_offset);
+ }
+
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
+
+ if(has_a_offset)
+ {
+ in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
+ }
+ if(has_bias)
+ {
+ in_s32 = add_s32(in_s32, load(bias_ptr, x));
+ }
+ if(!is_fixed_point)
+ {
+ in_s32 = add_s32(in_s32, offset_term_s32);
+ in_s32 = mul_s32(in_s32, result_multipliers + x);
+ }
+
+ if(is_fixed_point)
+ {
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm<is_bounded_relu>(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8));
+ }
+ else
+ {
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point<is_bounded_relu>(in_s32, load(result_shifts, x), min_s8, max_s8));
+ }
+ }
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+
+ if(has_a_offset)
+ {
+ in_value += (*(vector_sum_col_ptr + x) * a_offset);
+ }
+ if(has_bias)
+ {
+ in_value += *(bias_ptr + x);
+ }
+
+ if(is_fixed_point)
+ {
+ // Finalize and store the result
+ *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound));
+ }
+ else
+ {
+ // Finalize quantization
+ in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]);
+
+ // Bound and store the result
+ if(is_bounded_relu)
+ {
+ in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+ }
+ *(out_it.ptr() + x) = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
+ }
+ }
+}
+
template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
void run_offset_contribution_output_stage(const Window &window,
const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
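These offset-contribution functions never requantize the GEMM from scratch: they correct the raw S32 product with per-row sums of A, per-column sums of B and a constant k_offset. That is sufficient because of the identity below (z_a, z_b are the zero points; the kernel's internal a_offset/b_offset follow the library's own sign convention):

    sum_k (a_ik - z_a) * (b_kj - z_b)
        = sum_k a_ik * b_kj  -  z_b * sum_k a_ik  -  z_a * sum_k b_kj  +  K * z_a * z_b

A tiny self-check of that identity with made-up values:

#include <cassert>
#include <cstdint>

int main()
{
    const int32_t a[3] = { 12, 250, 3 }, b[3] = { 7, 130, 255 }; // arbitrary quantized values
    const int32_t z_a = 10, z_b = 128, K = 3;                    // arbitrary zero points

    int32_t direct = 0, raw = 0, sum_a = 0, sum_b = 0;
    for(int k = 0; k < K; ++k)
    {
        direct += (a[k] - z_a) * (b[k] - z_b);
        raw    += a[k] * b[k];
        sum_a  += a[k];
        sum_b  += b[k];
    }
    assert(direct == raw - z_b * sum_a - z_a * sum_b + K * z_a * z_b);
    return 0;
}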
@@ -307,10 +472,16 @@ void run_offset_contribution_output_stage(const Window &window,
const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
- const int32x4_t result_offset_s32 = vdupq_n_s32(output_stage.gemmlowp_offset);
- const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? output_stage.gemmlowp_shift : -output_stage.gemmlowp_shift);
- const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_min_bound));
- const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+ const int32_t multiplier = output_stage.gemmlowp_multiplier;
+ const int32_t shift = output_stage.gemmlowp_shift;
+ const int32_t offset = output_stage.gemmlowp_offset;
+ const int32_t min_bound = output_stage.gemmlowp_min_bound;
+ const int32_t max_bound = output_stage.gemmlowp_max_bound;
+
+ const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
+ const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift);
+ const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(min_bound));
+ const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(max_bound));
const int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -349,7 +520,8 @@ void run_offset_contribution_output_stage(const Window &window,
run_offset_contribution_output_stage_window<true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
}
@@ -363,7 +535,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
}
@@ -386,7 +559,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_row_it, bias_it, mm_result_it, out_it);
}
@@ -399,7 +573,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_row_it, mm_result_it, out_it);
}
@@ -422,7 +597,8 @@ void run_offset_contribution_output_stage(const Window &window,
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
run_offset_contribution_output_stage_window<true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, bias_it, mm_result_it, out_it);
}
@@ -434,7 +610,8 @@ void run_offset_contribution_output_stage(const Window &window,
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
run_offset_contribution_output_stage_window<true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, mm_result_it, out_it);
}
@@ -448,7 +625,8 @@ void run_offset_contribution_output_stage(const Window &window,
{
run_offset_contribution_output_stage_window<false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
bias_it, mm_result_it, out_it);
}
@@ -458,7 +636,110 @@ void run_offset_contribution_output_stage(const Window &window,
{
run_offset_contribution_output_stage_window<false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ mm_result_it, out_it);
+ }
+ return;
+ }
+}
+
+template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
+void run_offset_contribution_output_stage_symm(const Window &window,
+ const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
+ int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
+ GEMMLowpOutputStageInfo output_stage)
+{
+ ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
+
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ const int32_t offset = output_stage.gemmlowp_offset;
+ const int32_t min_bound = output_stage.gemmlowp_min_bound;
+ const int32_t max_bound = output_stage.gemmlowp_max_bound;
+
+ const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
+ const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
+ const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
+ const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound));
+ const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
+
+ Iterator mm_result_it(mm_result, win);
+ Iterator out_it(output, win);
+
+ if(a_offset != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+
+ Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ run_offset_contribution_output_stage_window_symm<true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ run_offset_contribution_output_stage_window_symm<true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
+ }
+ }
+ else
+ {
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm<false, true, is_bounded_relu, is_fixed_point>(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm<false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
mm_result_it, out_it);
}
@@ -470,8 +751,18 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+ if(output->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 127);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < -128);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
if(bias != nullptr)
@@ -525,7 +816,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
}
@@ -551,7 +842,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result,
}
NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction
-get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, GEMMLowpOutputStageInfo output_stage)
+get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage)
{
static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function =
{
@@ -562,7 +853,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row,
{ 4, &run_offset_contribution_output_stage<false, false, true> },
{ 5, &run_offset_contribution_output_stage<true, false, true> },
{ 6, &run_offset_contribution_output_stage<false, true, true> },
- { 7, &run_offset_contribution_output_stage<true, true, true> }
+ { 7, &run_offset_contribution_output_stage<true, true, true> },
+ { 8, &run_offset_contribution_output_stage_symm<false, false, false> },
+ { 9, &run_offset_contribution_output_stage_symm<true, false, false> },
+ { 10, &run_offset_contribution_output_stage_symm<false, true, false> },
+ { 11, &run_offset_contribution_output_stage_symm<true, true, false> },
+ { 12, &run_offset_contribution_output_stage_symm<false, false, true> },
+ { 13, &run_offset_contribution_output_stage_symm<true, false, true> },
+ { 14, &run_offset_contribution_output_stage_symm<false, true, true> },
+ { 15, &run_offset_contribution_output_stage_symm<true, true, true> }
};
// Check if input is a 3D reinterpretation
@@ -574,11 +873,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row,
const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound)
&& !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255));
+ // Check if we need to perform fixed point requantization
const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
+ // Check if symmetric per-channel execution
+ const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED;
+
// key acts as a bitset, setting the first bit on reinterpret_as_3d,
// the second on is_bounded_relu, and the third on is_fixed_point.
- uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2);
+ uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 1UL : 0UL) << 3);
return map_function.find(key)->second;
}
} // namespace
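get_configured_function() now folds a fourth flag into its lookup key: bit 3 (is_symm) selects the run_offset_contribution_output_stage_symm variants used for signed output, on top of the three bits listed in the comment above it. A minimal sketch of the key construction (hypothetical standalone helper, not library code):

#include <cstdint>

uint8_t make_key(bool reinterpret_as_3d, bool is_bounded_relu, bool is_fixed_point, bool is_symm)
{
    return static_cast<uint8_t>((reinterpret_as_3d ? 1U : 0U)
                                | ((is_bounded_relu ? 1U : 0U) << 1)
                                | ((is_fixed_point ? 1U : 0U) << 2)
                                | ((is_symm ? 1U : 0U) << 3));
}

// e.g. make_key(false, true, true, false) == 6  -> run_offset_contribution_output_stage<false, true, true>
//      make_key(false, true, true, true)  == 14 -> run_offset_contribution_output_stage_symm<false, true, true>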
@@ -591,8 +894,9 @@ NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutpu
}
void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col,
- const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+ const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
+ int32_t k, int32_t a_offset, int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
@@ -627,7 +931,7 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
- _function = get_configured_function(mm_result, vector_sum_row, output_stage);
+ _function = get_configured_function(mm_result, vector_sum_row, output, output_stage);
}
Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index c1ee770db5..72632492d7 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,13 +27,13 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
@@ -48,7 +48,7 @@ namespace
{
Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
return Status{};
@@ -72,7 +72,7 @@ std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITens
Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
return Status{};
@@ -128,11 +128,12 @@ Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, cons
return Status{};
}
-void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
+template <typename T>
+void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ // Intermediate and final accumulator types
+ using TIAcc = wrapper::traits::promote_t<T>;
+ using TAcc = wrapper::traits::promote_t<TIAcc>;
Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
@@ -149,9 +150,9 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- uint32x4_t sum_row = vdupq_n_u32(0);
+ auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
- const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+ const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
@@ -161,43 +162,41 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 4 accumulations
for(; i <= (_k - 4); i += 4)
{
- const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4);
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4);
- // Convert U8 to U16
- uint16x4x4_t a0_u16 =
+ // Convert 8-bit to 16-bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W64>::type a0_d16[4] =
{
- {
- vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(a0_u8)))
- }
+ wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
+ wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
+ wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))),
+ wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8)))
};
- // Accumulate to U16
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]);
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]);
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]);
+ // Accumulate to 16-bit
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]);
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]);
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]);
- // Accumulate to U32
- sum_row = vaddw_u16(sum_row, a0_u16.val[0]);
+ // Accumulate to 32-bit
+ sum_row = wrapper::vaddw(sum_row, a0_d16[0]);
}
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4);
+ const auto a0_d8 = wrapper::vload(matrix_a + i * 4);
// Convert U8 to U16
- const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8));
+ const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8));
// Accumulate to U32
- sum_row = vaddw_u16(sum_row, a0_u16);
+ sum_row = wrapper::vaddw(sum_row, a0_d16);
}
auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row));
+ wrapper::vstore(vector_sum_row, wrapper::vreinterpret_s32(sum_row));
},
in, out);
}
@@ -206,10 +205,10 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- uint32x4_t sum_row_u32 = vdupq_n_u32(0);
- uint32_t sum_row = 0;
+ auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+ TAcc sum_row = 0;
- const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+ const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
@@ -219,37 +218,57 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 16 accumulations
for(; i <= (_k - 16); i += 16)
{
- const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i);
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i);
// Partial accumulations in U16
- const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8));
+ const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
// Accumulate to U32
- sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0));
+ vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
}
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- sum_row += static_cast<uint32_t>(matrix_a[i]);
+ sum_row += static_cast<TAcc>(matrix_a[i]);
}
#if defined(__aarch64__)
// Reduction operation available on 64 bit architectures only
- sum_row += vaddvq_u32(sum_row_u32);
+ sum_row += wrapper::vaddv(vsum_row);
#else // __aarch64__
- uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32));
- tmp = vpadd_u32(tmp, tmp);
+ auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+ tmp = wrapper::vpadd(tmp, tmp);
- sum_row += vget_lane_u32(tmp, 0);
+ sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__
- *(reinterpret_cast<int *>(out.ptr())) = static_cast<int>(sum_row);
+ *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
},
in, out);
}
}
+void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ run_internal<uint8_t>(window);
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ run_internal<int8_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+}
+
void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
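run_internal<T>(), used by both reduction kernels, derives its intermediate and final accumulator types with wrapper::traits::promote_t, so uint8_t input is summed through uint16_t into uint32_t and int8_t input through int16_t into int32_t. A compile-time sketch of that promotion chain, with hypothetical trait names standing in for the wrapper traits:

#include <cstdint>
#include <type_traits>

template <typename T> struct promote;
template <> struct promote<uint8_t>  { using type = uint16_t; };
template <> struct promote<int8_t>   { using type = int16_t;  };
template <> struct promote<uint16_t> { using type = uint32_t; };
template <> struct promote<int16_t>  { using type = int32_t;  };
template <typename T> using promote_t = typename promote<T>::type;

static_assert(std::is_same<promote_t<promote_t<uint8_t>>, uint32_t>::value, "QASYMM8 accumulates in uint32_t");
static_assert(std::is_same<promote_t<promote_t<int8_t>>, int32_t>::value, "QASYMM8_SIGNED / QSYMM8_PER_CHANNEL accumulate in int32_t");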
@@ -276,11 +295,12 @@ Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, cons
return Status{};
}
-void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
+template <typename T>
+void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ // Intermediate and final accumulator types
+ using TIAcc = wrapper::traits::promote_t<T>;
+ using TAcc = wrapper::traits::promote_t<TIAcc>;
Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
@@ -297,17 +317,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- uint32x4x4_t sum_col =
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
{
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
};
- const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2];
+ const auto *matrix_b = reinterpret_cast<const T *>(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
@@ -316,35 +334,28 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
int i = 0;
for(; i < _k; ++i)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16);
+ const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16);
- // Convert S8 to U16
- const uint16x8x2_t b0_u16 =
+ // Convert 8bit to 16bit
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] =
{
- {
- vmovl_u8(vget_low_u8(b0_u8)),
- vmovl_u8(vget_high_u8(b0_u8))
- }
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)),
+ wrapper::vmovl(wrapper::vgethigh(b0_b8))
};
// Accumulate to U32
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
- }
- };
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
}
auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
- vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
- vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
- vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3]));
},
in, out);
}
@@ -377,17 +388,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
}
// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- uint32x4x4_t sum_col =
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
{
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
};
- const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2];
+ const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
@@ -398,10 +407,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 4 accumulations
for(; i <= (_k - 4); i += 4)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
- const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride);
- const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride);
- const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride);
+ const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+ const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+ const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
#if __arm__
asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
@@ -410,34 +419,27 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */
- // Partial accumulation in u16
- uint16x8x2_t tmp_sum =
+ // Partial accumulation in 16bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
{
- {
- vdupq_n_u16(0),
- vdupq_n_u16(0)
- }
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
};
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8));
-
- // Accumulate to U32
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1]))
- }
- };
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
matrix_b += 4 * in_b_stride;
}
@@ -445,38 +447,51 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
+ const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
// Convert S8 to S16
- const uint16x8x2_t b0_u16 =
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
{
- {
- vmovl_u8(vget_low_u8(b0_u8)),
- vmovl_u8(vget_high_u8(b0_u8))
- }
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)),
+ wrapper::vmovl(wrapper::vgethigh(b0_b8))
};
- // Accumulate to U32
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
- }
- };
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
matrix_b += in_b_stride;
}
auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
- vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
- vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
- vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3]));
},
inb, out);
}
}
+
+void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ run_internal<uint8_t>(window, info);
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ run_internal<int8_t>(window, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+}
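A side note on why the 16-bit staging used throughout these reductions is safe: in every loop above, at most four widened 8-bit lanes are folded into a 16-bit lane before the running total moves to 32 bits, so the intermediate accumulators cannot overflow. A compile-time check of that worst-case bound (not part of the patch):

#include <cstdint>
#include <limits>

static_assert(4 * 255 <= std::numeric_limits<uint16_t>::max(), "four u8 lanes fit in a u16 lane");
static_assert(4 * 127 <= std::numeric_limits<int16_t>::max(), "four positive s8 lanes fit in an s16 lane");
static_assert(4 * -128 >= std::numeric_limits<int16_t>::min(), "four negative s8 lanes fit in an s16 lane");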
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 0ca7fd3dc8..ea3d32e628 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -55,7 +55,7 @@ TensorShape get_output_shape(const ITensorInfo *input)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::U8, DataType::S8,
DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 624833adfb..649316442e 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -49,7 +49,7 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
if(biases != nullptr)
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index fa335d757b..6d276d1322 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -162,6 +162,7 @@ const std::string &arm_compute::string_from_data_type(DataType dt)
{ DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" },
{ DataType::QASYMM8_PER_CHANNEL, "QASYMM8_PER_CHANNEL" },
{ DataType::QASYMM8, "QASYMM8" },
+ { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" },
{ DataType::QSYMM16, "QSYMM16" },
{ DataType::QASYMM16, "QASYMM16" },
};
@@ -292,6 +293,7 @@ std::string arm_compute::string_from_pixel_value(const PixelValue &value, const
converted_string = ss.str();
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
// Needs conversion to 32 bit, otherwise interpreted as ASCII values
ss << int32_t(value.get<int8_t>());
@@ -448,6 +450,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const
print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
break;
@@ -485,6 +488,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp
case DataType::QASYMM8_PER_CHANNEL:
return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
case DataType::U16:
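Aside on the int8_t handling added above: streaming an int8_t (the storage type behind S8, QASYMM8_SIGNED and QSYMM8_PER_CHANNEL) selects the character overload of operator<<, so without the int32_t conversion the value would print as an ASCII character instead of a number. A tiny self-contained illustration:

#include <cstdint>
#include <iostream>

int main()
{
    const int8_t q = 65;
    std::cout << q << "\n";          // prints 'A' - the char overload is selected
    std::cout << int32_t(q) << "\n"; // prints 65 - the numeric value
    return 0;
}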
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index cdd48972eb..386d75eca2 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -108,6 +108,44 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
return Status{};
}
+
+arm_compute::Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
+ GEMMLowpOutputStageInfo &stage_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(wq_info.scale().empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(oq_info.scale().empty());
+
+ const unsigned int size = wq_info.scale().size();
+
+ auto &quant_multipliers = stage_info.gemmlowp_multipliers;
+ auto &quant_shifts = stage_info.gemmlowp_shifts;
+ quant_multipliers.resize(size);
+ quant_shifts.resize(size);
+
+ const auto &w_scales = wq_info.scale();
+ const float i_scale = iq_info.scale().at(0);
+ const float o_scale = oq_info.scale().at(0);
+
+ for(unsigned int i = 0; i < size; ++i)
+ {
+ const float multiplier = i_scale * w_scales[i] / o_scale;
+ int quant_multiplier = 0;
+ int quant_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &quant_shift));
+ quant_multipliers[i] = quant_multiplier;
+ quant_shifts[i] = quant_shift;
+ }
+
+    // Legacy fields: also populate the single multiplier/shift (element 0) for backward compatibility
+ stage_info.gemmlowp_shift = quant_shifts[0];
+ stage_info.gemmlowp_multiplier = quant_multipliers[0];
+
+ return Status{};
+}
+
std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_type)
{
int min_quant_val = 0;
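For context on the helper added above: calculate_quantized_multipliers_less_than_one applies the existing scalar decomposition once per weight-channel scale. Each real multiplier m = i_scale * w_scale[i] / o_scale (expected in [0, 1)) is approximated as m ~= M * 2^-s, where M is a 32-bit fixed-point (Q0.31) multiplier and s a right shift; the first pair is also copied into the single-valued gemmlowp_multiplier/gemmlowp_shift fields. A standalone sketch of that decomposition (illustrative, not the library's exact implementation):

#include <cmath>
#include <cstdint>

// Decompose a multiplier in [0, 1) as quant_mult * 2^(-shift), with quant_mult in Q0.31.
bool quantize_multiplier_sketch(double multiplier, int32_t *quant_mult, int *shift)
{
    if(multiplier < 0.0 || multiplier >= 1.0)
    {
        return false;
    }
    if(multiplier == 0.0)
    {
        *quant_mult = 0;
        *shift      = 0;
        return true;
    }
    const double q = std::frexp(multiplier, shift); // multiplier = q * 2^exp, q in [0.5, 1)
    *shift          = -*shift;                      // store the exponent as a right shift
    int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if(q_fixed == (1ll << 31))                      // rounding can push q up to exactly 1.0
    {
        q_fixed /= 2;
        --*shift;
    }
    *quant_mult = static_cast<int32_t>(q_fixed);
    return true;
}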
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index b31ecb91e9..43e531579a 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -450,13 +450,24 @@ Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo
#ifndef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64");
#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::F16, DataType::F32);
+ if(is_data_type_quantized_per_channel(b->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8_SIGNED && d->data_type() != DataType::S32, "Only S32 output supported for QASYMM8_SIGNED input");
return Status{};
}
@@ -495,6 +506,7 @@ void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const
}
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
break;
#endif /* __aarch64__ */
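Read together, the NEGEMMAssemblyDispatch::validate() checks above pin down which output type each input type may produce on this path. A compact restatement of just those checks (QSYMM8_PER_CHANNEL weights are constrained separately to pair with S8 or QASYMM8_SIGNED inputs):

// Output type required for a given input type, mirroring the ARM_COMPUTE_RETURN_ERROR_ON_MSG checks above.
enum class DT { U8, S8, QASYMM8, QASYMM8_SIGNED, F16, F32, U32, S32 };

DT required_output_sketch(DT input)
{
    switch(input)
    {
        case DT::F32:            return DT::F32;
        case DT::F16:            return DT::F16;
        case DT::U8:             return DT::U32;
        case DT::S8:             return DT::S32;
        case DT::QASYMM8:        return DT::QASYMM8;
        case DT::QASYMM8_SIGNED: return DT::S32;
        default:                 return input;
    }
}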
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index f4377cdaf2..caff117e09 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -59,7 +59,7 @@ void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const I
Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
if(biases != nullptr)
@@ -114,18 +114,18 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
-
- input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
- weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
- const UniformQuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier;
- int output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ const QuantizationInfo iqinfo = input->info()->quantization_info();
+ const QuantizationInfo wqinfo = weights->info()->quantization_info();
+ const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
+ const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+
+ input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+ if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+ {
+ const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+ weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+ }
// Merge activation with output stage
int min_activation = 0;
@@ -133,26 +133,25 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
if(supported_acts.count(act_info.activation()) != 0)
{
- const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
- const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+ const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+ const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
}
GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = oqinfo.offset;
- output_info.gemmlowp_multiplier = output_multiplier;
- output_info.gemmlowp_shift = output_shift;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
     // Revert back QuantizationInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset));
- weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, wqinfo.offset));
+ input->info()->set_quantization_info(iqinfo);
+ weights->info()->set_quantization_info(wqinfo);
}
else
{
@@ -176,20 +175,10 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
-
- std::unique_ptr<ITensorInfo> input_qa = input->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
- weights_qa->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
- const UniformQuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier;
- int output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+ const QuantizationInfo &iqinfo = input->quantization_info();
+ const QuantizationInfo &wqinfo = weights->quantization_info();
+ const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
// Merge activation with output stage
int min_activation = 0;
@@ -201,22 +190,25 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
};
if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
{
- const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
- const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+ const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+ const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
}
GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = oqinfo.offset;
- output_info.gemmlowp_multiplier = output_multiplier;
- output_info.gemmlowp_shift = output_shift;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info));
// Perform validation step on GEMMLowp
+ std::unique_ptr<ITensorInfo> input_qa = input->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
}
else
@@ -396,7 +388,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
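For context on the a_const_int / b_const_int values in the hunks above: fusing a (bounded) ReLU into the GEMMLowp output stage just clamps the requantized result to the activation bounds expressed in the output's quantized domain, which is why act_info.a() and act_info.b() are passed through quantize_qasymm8 with the uniform output quantization. A self-contained sketch of that conversion (the rounding policy here is an assumption of the sketch, not taken from the library):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize a real value to QASYMM8: q = clamp(round(x / scale) + offset, 0, 255).
uint8_t quantize_qasymm8_sketch(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// E.g. for LU_BOUNDED_RELU: min_activation = quantize(act_info.b()) and
// max_activation = quantize(act_info.a()), matching the assignments above.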
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 63f330be6c..a478fdd231 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,11 +26,12 @@
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 617d66cf24..01a99f7aca 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,9 +42,9 @@ using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr),
- _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
- _fuse_output_stage(false), _run_activation(false)
+ _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+ _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+ _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
{
}
@@ -56,6 +56,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
const ITensor *matrix_a = a;
const ITensor *matrix_b = b;
+ GEMMInfo info = gemm_info;
// Clear state
_mtx_a_reshape_kernel = nullptr;
@@ -65,13 +66,41 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_a_offset = a->info()->quantization_info().uniform().offset;
_b_offset = b->info()->quantization_info().uniform().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
_is_prepared = false;
_fused_assembly_path = false;
+ _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
_original_b = b;
+ const ITensor *a_to_use = a;
+
+    // Convert QASYMM8 -> QASYMM8_SIGNED and back
+ if(_flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();
+
+ _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
+ _memory_group.manage(&_signed_a);
+ _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+ a_to_use = &_signed_a;
+ _a_offset = _signed_a.info()->quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
+ _memory_group.manage(&_signed_output);
+ _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+ }
+
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
_fuse_output_stage = true;
_memory_group.manage(&_mm_result_s32);
@@ -83,17 +112,18 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
switch(a->info()->data_type())
{
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::U8:
case DataType::S8:
{
- if(a->info()->data_type() == DataType::QASYMM8 && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if(a_to_use->info()->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- _asm_glue.configure(a, b, c, output, gemm_info);
+ _asm_glue.configure(a_to_use, b, c, output, gemm_info);
_fused_assembly_path = _asm_glue.is_configured();
}
else
{
- _asm_glue.configure(a, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
+ _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
}
_assembly_path = _asm_glue.is_configured();
break;
@@ -111,7 +141,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
matrix_b = &_tmp_b;
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+ TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
_tmp_a.allocator()->init(a_info);
@@ -125,7 +155,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
// Configure interleave kernel
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a, &_tmp_a);
+ k->configure(a_to_use, &_tmp_a);
_mtx_a_reshape_kernel = std::move(k);
}
@@ -151,19 +181,19 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
}
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
{
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
+ TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
_vector_sum_row.allocator()->init(info_vector_sum_row);
_memory_group.manage(&_vector_sum_row);
// Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
+ _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);
}
if(_fuse_output_stage)
@@ -176,8 +206,17 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_mm_kernel = std::move(k);
}
- _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
- _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
+ _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : output,
+ a->info()->dimension(0),
+ _a_offset, _b_offset, info.gemmlowp_output_stage());
+
+ if(_flip_signedness)
+ {
+ _convert_from_signed_asymm.configure(&_signed_output, output);
+ }
}
else
{
@@ -189,7 +228,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_mm_kernel = std::move(k);
}
// Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
}
}
@@ -228,22 +267,31 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
{
_mm_result_s32.allocator()->allocate();
}
+
+ if(_flip_signedness)
+ {
+ _signed_a.allocator()->allocate();
+ _signed_output.allocator()->allocate();
+ }
}
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ GEMMInfo info = gemm_info;
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
+ const ITensorInfo *a_to_use = a;
+
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
TensorInfo mm_result_s32_info{};
@@ -251,31 +299,57 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
int32_t a_offset = a->quantization_info().uniform().offset;
int32_t b_offset = b->quantization_info().uniform().offset;
- bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
if(fuse_output_stage)
{
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
}
+ // Convert QASYMM8->QASYMM8_SIGNED
+ TensorInfo signed_a{};
+ TensorInfo signed_output{};
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if(flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+ a_to_use = &signed_a;
+ a_offset = signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+ }
+
// Check if we need to run the optimized assembly kernel
bool run_optimised = false;
bool run_optimised_requantized = false;
- if(is_data_type_quantized_asymmetric(a->data_type()))
+ if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
run_optimised_requantized = run_optimised;
}
else
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
}
if(run_optimised)
{
ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if(info.depth_output_gemm3d() != 0)
{
- if(gemm_info.reinterpret_input_as_3d())
+ if(info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -292,8 +366,8 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
if(!run_vector_matrix_multiplication)
@@ -312,10 +386,10 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
}
}
@@ -340,7 +414,7 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false));
}
if(fuse_output_stage)
@@ -354,8 +428,10 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
b_offset == 0 ? nullptr : &info_vector_sum_row,
- c, output, a_offset, b_offset,
- gemm_info.gemmlowp_output_stage()));
+ c,
+ flip_signedness ? &signed_output : output,
+ a_offset, b_offset,
+ info.gemmlowp_output_stage()));
}
else
{
@@ -397,6 +473,12 @@ void NEGEMMLowpMatrixMultiplyCore::run()
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
+ // Convert QASYMM8->QASYMM8_SIGNED
+ if(_flip_signedness)
+ {
+ NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+ }
+
// Run GEMM
if(_asm_glue.is_configured())
{
@@ -433,6 +515,12 @@ void NEGEMMLowpMatrixMultiplyCore::run()
}
}
+ // Convert QASYMM8_SIGNED->QASYMM8
+ if(_flip_signedness)
+ {
+ NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+ }
+
// Run fused activation
if(_run_activation)
{
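A closing note on the _flip_signedness path threaded through configure(), validate() and run() above: the assembly dispatch only accepts S8 or QASYMM8_SIGNED inputs when the weights are QSYMM8_PER_CHANNEL (see the validate() change earlier in this diff), so a QASYMM8 input is re-expressed as QASYMM8_SIGNED before the GEMM and converted back after the output stage, with the quantization offset and the output-stage bounds corrected by the same offset_correction of 128. The value remapping itself is a shift by 128; a minimal sketch (illustrative, not the NEON implementation of NEConvertQuantizedSignednessKernel):

#include <cstdint>

// Subtracting 128 maps [0, 255] onto [-128, 127] while preserving ordering; the represented
// real value stays the same because the quantization offset is moved by the matching 128.
int8_t u8_to_s8_sketch(uint8_t v)
{
    return static_cast<int8_t>(static_cast<int32_t>(v) - 128);
}

uint8_t s8_to_u8_sketch(int8_t v)
{
    return static_cast<uint8_t>(static_cast<int32_t>(v) + 128);
}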