COMPMID-970 : Remove QS8 / QS16 support

Removed fixed point related code. Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
author: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com> 2018-07-04 09:34:00 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:54:10 +0000
commit: 7485d5a62685cb745ab50e970adb722cb71557ac (patch)
tree: ba01b99ca466c93edc9a3f8c1e34394ff84be060 /src/core/NEON/kernels
parent: 014333d73883c3872e458cedda5ccef586a7ccd4 (diff)
download: ComputeLibrary-7485d5a62685cb745ab50e970adb722cb71557ac.tar.gz
37 files changed, 142 insertions, 2846 deletions
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index ec125154a4..bdc93ed1b8 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
 
-#include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEAsymm.h"
@@ -46,14 +45,13 @@ namespace
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::F16, DataType::F32);
 
     // Checks performed when output is configured
     if((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     return Status{};
@@ -146,36 +144,6 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat
     };
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
 
-    // Activation functions : QS8
-    static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
-    {
-        { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint8_t> },
-        { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint8_t> },
-        { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint8_t> },
-        { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint8_t> },
-        { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint8_t> },
-        { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qint8_t> },
-        { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, qint8_t> },
-        { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint8_t> },
-        { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint8_t> },
-        { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
-        { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
-    };
-    // Activation functions : QS16
-    static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs16 =
-    {
-        { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint16_t> },
-        { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint16_t> },
-        { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint16_t> },
-        { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint16_t> },
-        { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint16_t> },
-        { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qint16_t> },
-        { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, qint16_t> },
-        { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint16_t> },
-        { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint16_t> },
-        { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint16_t> },
-        { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint16_t> },
-    };
     // Activation functions : QASYMM8
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 =
     {
@@ -188,12 +156,6 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat
         case DataType::QASYMM8:
             _func = act_map_qasymm8[activation_info.activation()];
             break;
-        case DataType::QS8:
-            _func = act_map_qs8[activation_info.activation()];
-            break;
-        case DataType::QS16:
-            _func = act_map_qs16[activation_info.activation()];
-            break;
         case DataType::F32:
             _func = act_map_f32[activation_info.activation()];
             break;
@@ -508,70 +470,6 @@ typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationL
 }
 
 template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
-{
-    Iterator  input(_input, window);
-    Iterator  output(_output, window);
-    const int fixed_point_position = _input->info()->fixed_point_position();
-
-    static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
-    const qint8x16_t        CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position));
-    const qint8x16_t        a       = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position));
-    const qint8x16_t        b       = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position));
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
-        const qint8x16_t in  = vld1q_qs8(input_ptr);
-        qint8x16_t       tmp = {};
-
-        switch(F)
-        {
-            case ActivationFunction::ABS:
-                tmp = vqabsq_qs8(in);
-                break;
-            case ActivationFunction::LINEAR:
-                tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
-                break;
-            case ActivationFunction::LOGISTIC:
-                tmp = vqrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
-                break;
-            case ActivationFunction::RELU:
-                tmp = vmaxq_qs8(CONST_0, in);
-                break;
-            case ActivationFunction::BOUNDED_RELU:
-                tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
-                break;
-            case ActivationFunction::LU_BOUNDED_RELU:
-                tmp = vminq_qs8(a, vmaxq_qs8(b, in));
-                break;
-            case ActivationFunction::LEAKY_RELU:
-                tmp = vbslq_s8(vcgtq_s8(in, CONST_0), in, vmulq_qs8(a, in, fixed_point_position));
-                break;
-            case ActivationFunction::SOFT_RELU:
-                tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
-                break;
-            case ActivationFunction::SQRT:
-                tmp = vqrecipq_qs8(vqinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
-                break;
-            case ActivationFunction::SQUARE:
-                tmp = vqmulq_qs8(in, in, fixed_point_position);
-                break;
-            case ActivationFunction::TANH:
-                tmp = vqmulq_qs8(a, vqtanhq_qs8(vqmulq_qs8(b, in, fixed_point_position), fixed_point_position), fixed_point_position);
-                break;
-            default:
-                break;
-        }
-
-        vst1q_qs8(output_ptr, tmp);
-    },
-    input, output);
-}
-
-template <ActivationLayerInfo::ActivationFunction F, typename T>
 typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
 {
     Iterator               input(_input, window);
@@ -620,137 +518,6 @@ typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivat
     input, output);
 }
 
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
-{
-    Iterator  input(_input, window);
-    Iterator  output(_output, window);
-    const int fixed_point_position = _input->info()->fixed_point_position();
-
-    static const qint16x8_t CONST_0 = vdupq_n_qs16(0);
-    const qint16x8_t        CONST_1 = vdupq_n_qs16(sqcvt_qs16_f32(1.f, fixed_point_position));
-    const qint16x8_t        a       = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.a(), fixed_point_position));
-    const qint16x8_t        b       = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.b(), fixed_point_position));
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        const qint16x8x2_t in  = vld2q_s16(input_ptr);
-        qint16x8x2_t       tmp = { {} };
-
-        switch(F)
-        {
-            case ActivationFunction::ABS:
-                tmp =
-                {
-                    {
-                        vqabsq_qs16(in.val[0]),
-                        vqabsq_qs16(in.val[1]),
-                    }
-                };
-                break;
-            case ActivationFunction::LINEAR:
-                tmp =
-                {
-                    {
-                        vqmlaq_qs16(b, a, in.val[0], fixed_point_position),
-                        vqmlaq_qs16(b, a, in.val[1], fixed_point_position),
-                    }
-                };
-                break;
-            case ActivationFunction::LOGISTIC:
-                tmp =
-                {
-                    {
-                        vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[0]), fixed_point_position)), fixed_point_position),
-                        vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[1]), fixed_point_position)), fixed_point_position),
-                    }
-                };
-                break;
-            case ActivationFunction::RELU:
-                tmp =
-                {
-                    {
-                        vmaxq_qs16(CONST_0, in.val[0]),
-                        vmaxq_qs16(CONST_0, in.val[1]),
-                    }
-                };
-                break;
-            case ActivationFunction::BOUNDED_RELU:
-                tmp =
-                {
-                    {
-                        vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[0])),
-                        vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[1])),
-                    }
-                };
-                break;
-            case ActivationFunction::LU_BOUNDED_RELU:
-                tmp =
-                {
-                    {
-                        vminq_qs16(a, vmaxq_qs16(b, in.val[0])),
-                        vminq_qs16(a, vmaxq_qs16(b, in.val[1])),
-                    }
-                };
-                break;
-            case ActivationFunction::LEAKY_RELU:
-                tmp =
-                {
-                    {
-                        vbslq_s16(vcgtq_s16(in.val[0], CONST_0), in.val[0], vmulq_qs16(a, in.val[0], fixed_point_position)),
-                        vbslq_s16(vcgtq_s16(in.val[1], CONST_0), in.val[1], vmulq_qs16(a, in.val[1], fixed_point_position)),
-                    }
-                };
-                break;
-            case ActivationFunction::SOFT_RELU:
-                tmp =
-                {
-                    {
-                        vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[0], fixed_point_position)), fixed_point_position),
-                        vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[1], fixed_point_position)), fixed_point_position),
-                    }
-                };
-                break;
-            case ActivationFunction::SQRT:
-                tmp =
-                {
-                    {
-                        vqrecipq_qs16(vqinvsqrtq_qs16(in.val[0], fixed_point_position), fixed_point_position),
-                        vqrecipq_qs16(vqinvsqrtq_qs16(in.val[1], fixed_point_position), fixed_point_position),
-                    }
-                };
-                break;
-            case ActivationFunction::SQUARE:
-                tmp =
-                {
-                    {
-                        vqmulq_qs16(in.val[0], in.val[0], fixed_point_position),
-                        vqmulq_qs16(in.val[1], in.val[1], fixed_point_position),
-                    }
-                };
-                break;
-            case ActivationFunction::TANH:
-                tmp =
-                {
-                    {
-                        vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[0], fixed_point_position), fixed_point_position), fixed_point_position),
-                        vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[1], fixed_point_position), fixed_point_position), fixed_point_position),
-                    }
-                };
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Function not implemented");
-                break;
-        }
-
-        vst2q_qs16(output_ptr, tmp);
-    },
-    input, output);
-}
-
 Status NEActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index a487090a98..f8e2b6d73e 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -48,38 +48,6 @@ namespace
 {
 constexpr unsigned int num_elems_processed_per_iteration = 16;
 
-void add_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
-        const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
-
-        vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vaddq_qs8(a, b));
-    },
-    input1, input2, output);
-}
-
-void add_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
-        const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
-
-        vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqaddq_qs8(a, b));
-    },
-    input1, input2, output);
-}
-
 void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
@@ -362,28 +330,21 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
 {
     ARM_COMPUTE_UNUSED(policy);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
-    if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(input2.data_type()))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &input2);
-    }
-
     // Validate in case of configured output
     if(output.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-            !(input1.data_type() == DataType::QS8 && input2.data_type() == DataType::QS8 && output.data_type() == DataType::QS8)
-            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
+            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
             && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
-            && !(input1.data_type() == DataType::QS16 && input2.data_type() == DataType::QS16 && output.data_type() == DataType::QS16)
             && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
             && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
@@ -391,11 +352,6 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
 
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                         "Wrong shape for output");
-
-        if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(output.data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &output);
-        }
     }
 
     return Status{};
@@ -460,8 +416,6 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor
 
     static std::map<std::string, AddFunction *> map_function =
     {
-        { "add_wrap_QS8_QS8_QS8", &add_wrap_QS8_QS8_QS8 },
-        { "add_saturate_QS8_QS8_QS8", &add_saturate_QS8_QS8_QS8 },
         { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
         { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
         { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
@@ -470,8 +424,6 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor
         { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
         { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
         { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
-        { "add_wrap_QS16_QS16_QS16", &add_wrap_S16_S16_S16 },
-        { "add_saturate_QS16_QS16_QS16", &add_saturate_S16_S16_S16 },
         { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
         { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
         { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index 3db80285c0..5a162e3b2c 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,38 +45,6 @@ class Coordinates;
 
 namespace
 {
-void sub_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
-        const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
-
-        vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vsubq_qs8(a, b));
-    },
-    input1, input2, output);
-}
-
-void sub_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
-        const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
-
-        vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqsubq_qs8(a, b));
-    },
-    input1, input2, output);
-}
-
 void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     Iterator input1(in1, window);
@@ -353,23 +321,15 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
 {
     ARM_COMPUTE_UNUSED(policy);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-
-    if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
-    {
-        // Check that all data types are the same and all fixed-point positions are the same
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
-    }
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        !(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8 && output->data_type() == DataType::QS8)
-        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
+        !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
         && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
         && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
         && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::QS16 && input2->data_type() == DataType::QS16 && output->data_type() == DataType::QS16)
         && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
         && !(input1->data_type() == DataType::F32 && input2->data_type() == DataType::F32 && output->data_type() == DataType::F32)
         && !(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16 && output->data_type() == DataType::F16),
@@ -432,8 +392,6 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens
 
     static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
     {
-        { "sub_wrap_QS8_QS8_QS8", &sub_wrap_QS8_QS8_QS8 },
-        { "sub_saturate_QS8_QS8_QS8", &sub_saturate_QS8_QS8_QS8 },
         { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
         { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
         { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
@@ -442,8 +400,6 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens
         { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
         { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
         { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
-        { "sub_wrap_QS16_QS16_QS16", &sub_wrap_S16_S16_S16 },
-        { "sub_saturate_QS16_QS16_QS16", &sub_saturate_S16_S16_S16 },
         { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
         { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
         { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 6be50fdb0d..6aed41f3aa 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -43,7 +43,7 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
                    const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_UNUSED(epsilon);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16,
                                                          DataType::F32);
 
     if(act_info.enabled())
@@ -60,22 +60,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
     if(beta != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
     }
     if(gamma != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
     }
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
@@ -104,112 +100,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } //namespace
 
 template <bool fused_activation>
-void NEBatchNormalizationLayerKernel::batch_normalization_qs8(const Window &window)
-{
-    static_assert(!fused_activation, "Activation is not supported for QS8");
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    // Hold information about the current feature map we are iterating.
-    // Only compute denominator and NEON vectors once per feature map.
-    int slice = -1;
-
-    const int  fixed_point_position = _input->info()->fixed_point_position();
-    const auto input_mean           = reinterpret_cast<const qint8_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var            = reinterpret_cast<const qint8_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma          = (_gamma != nullptr) ? reinterpret_cast<const qint8_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta           = (_beta != nullptr) ? reinterpret_cast<const qint8_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
-    qint8x16_t       mean_vec    = vdupq_n_qs8(0);
-    qint8x16_t       var_vec     = vdupq_n_qs8(0);
-    qint8x16_t       gamma_vec   = vdupq_n_qs8(sqcvt_qs8_f32(1, fixed_point_position));
-    qint8x16_t       beta_vec    = vdupq_n_qs8(sqcvt_qs8_f32(0, fixed_point_position));
-    qint8x16_t       denominator = vdupq_n_qs8(0);
-    const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(_epsilon, fixed_point_position));
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        if(slice != id.z())
-        {
-            // Conctruct vectors
-            mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
-            var_vec  = vdupq_n_qs8(*(input_var + id.z()));
-            if(input_gamma != nullptr)
-            {
-                gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
-            }
-            if(input_beta != nullptr)
-            {
-                beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
-            }
-
-            // Calculate denominator
-            denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
-            slice       = id.z();
-        }
-
-        // Calculate x bar and store results
-        const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), mean_vec);
-        const qint8x16_t x_bar     = vqmulq_qs8(numerator, denominator, fixed_point_position);
-        vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position));
-    },
-    input, output);
-}
-
-template <bool fused_activation>
-void NEBatchNormalizationLayerKernel::batch_normalization_qs16(const Window &window)
-{
-    static_assert(!fused_activation, "Activation is not supported for QS16");
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    // Hold information about the current feature map we are iterating.
-    // Only compute denominator and NEON vectors once per feature map.
-    int slice = -1;
-
-    const int  fixed_point_position = _input->info()->fixed_point_position();
-    const auto input_mean           = reinterpret_cast<const qint16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var            = reinterpret_cast<const qint16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma          = (_gamma != nullptr) ? reinterpret_cast<const qint16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta           = (_beta != nullptr) ? reinterpret_cast<const qint16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
-    qint16x8_t       mean_vec    = vdupq_n_qs16(0);
-    qint16x8_t       var_vec     = vdupq_n_qs16(0);
-    qint16x8_t       gamma_vec   = vdupq_n_qs16(sqcvt_qs16_f32(1, fixed_point_position));
-    qint16x8_t       beta_vec    = vdupq_n_qs16(sqcvt_qs16_f32(0, fixed_point_position));
-    qint16x8_t       denominator = vdupq_n_qs16(0);
-    const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(_epsilon, fixed_point_position));
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        if(slice != id.z())
-        {
-            // Conctruct vectors
-            mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
-            var_vec  = vdupq_n_qs16(*(input_var + id.z()));
-            if(input_gamma != nullptr)
-            {
-                gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
-            }
-            if(input_beta != nullptr)
-            {
-                beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
-            }
-
-            // Calculate denominator
-            denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position);
-            slice       = id.z();
-        }
-
-        // Calculate x bar and store results
-        const qint16x8_t numerator = vqsubq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), mean_vec);
-        const qint16x8_t x_bar     = vqmulq_qs16(numerator, denominator, fixed_point_position);
-        vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmlaq_qs16(beta_vec, x_bar, gamma_vec, fixed_point_position));
-    },
-    input, output);
-}
-
-template <bool fused_activation>
 void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
 {
     static_assert(!fused_activation, "Activation is not supported for FP16");
@@ -406,12 +296,6 @@ void NEBatchNormalizationLayerKernel::configure_non_fused()
     const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC;
     switch(_input->info()->data_type())
     {
-        case DataType::QS8:
-            _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs8<false>;
-            break;
-        case DataType::QS16:
-            _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs16<false>;
-            break;
         case DataType::F16:
             _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false> : &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false>;
             break;
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 9fda65feb4..d09d174e4f 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,8 +50,8 @@ TensorShape get_output_shape(const ITensorInfo *input, const Size2D &convolved_d
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
-                                                         DataType::U16, DataType::S16, DataType::QS16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
 
@@ -60,7 +60,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, convolved_dims));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
index b3746bddf2..e581f221a3 100644
--- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
@@ -65,7 +65,7 @@ void NEConvertFullyConnectedWeightsKernel::configure(const ITensor *input, ITens
 Status NEConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
                                                       DataLayout data_layout)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
                                                          DataType::QS32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 891a03c5cc..38443ca4a8 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,10 +41,6 @@ using namespace arm_compute;
 namespace
 {
 // Overloads of 128-bit vector loads
-uint8x16_t loadq(const uint8_t *ptr)
-{
-    return vld1q_u8(ptr);
-}
 uint16x8_t loadq(const uint16_t *ptr)
 {
     return vld1q_u16(ptr);
@@ -54,10 +50,6 @@ uint32x4_t loadq(const uint32_t *ptr)
     return vld1q_u32(ptr);
 }
 // Overloads of 128-bit vector stores
-void storeq(uint8_t *ptr, uint8x16_t val)
-{
-    return vst1q_u8(ptr, val);
-}
 void storeq(uint16_t *ptr, uint16x8_t val)
 {
     return vst1q_u16(ptr, val);
@@ -107,9 +99,8 @@ BorderSize NEDepthConcatenateLayerKernel::border_size() const
 
 void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
@@ -129,10 +120,6 @@ void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int
 
     switch(input->info()->data_type())
     {
-        case DataType::QS8:
-            _func = &depth_concat<uint8_t>;
-            break;
-        case DataType::QS16:
         case DataType::F16:
             _func = &depth_concat<uint16_t>;
             break;
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index c29cb57513..8280b52fcb 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,13 +40,13 @@ class Coordinates;
 } // namespace arm_compute
 
 NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
-    : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0)
+    : _input(nullptr), _output(nullptr), _policy(), _shift(0)
 {
 }
 
 void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16);
 
     _input  = input;
     _output = input;
@@ -58,48 +58,26 @@ void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, Conve
         // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
         set_shape_if_empty(*output->info(), input->info()->tensor_shape());
 
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
         // Set output
         _output = output;
     }
 
-    // Set initial fixed point position of input and output
-    _fixed_point_position_input  = input->info()->fixed_point_position();
-    _fixed_point_position_output = _output->info()->fixed_point_position();
-
-    // Set the fixed point position to the output tensor if needed
-    if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type()))
-    {
-        // If in-place set the fixed point position of the output tensor to be equal to shift
-        _fixed_point_position_output = (_input == _output) ? static_cast<int>(_shift) : _fixed_point_position_output;
-        // Set fixed point position to output tensor
-        _output->info()->set_fixed_point_position(_fixed_point_position_output);
-    }
-
-    ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type())));
+    ARM_COMPUTE_ERROR_ON(shift >= 8);
     ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
 
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
                                                                             && output->info()->data_type() != DataType::S32),
                              "Only data_types supported [in] U8 -> [out] U16, S16, S32");
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32),
-                             "Only data_types supported [in] QS8 ->  [out] QS8, F32");
-
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
                              "Only data_types supported [in] U16 ->  [out] U8, U32");
 
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
                              "Only data_types supported [in] S16 ->  [out] U8, S32");
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32),
-                             "Only data_types supported [in] QS16 ->  [out] QS16, F32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
-                             "Only data_types supported [in] F32 ->  [out] QS8, QS16");
-
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
     // Configure kernel window
@@ -132,8 +110,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info
     Iterator input(_input, window);
     Iterator output(_output, window);
 
-    bool in_place = (_input == _output);
-
     switch(_input->info()->data_type())
     {
         case DataType::U8:
@@ -212,49 +188,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info
             }
             break;
         }
-        case DataType::QS8:
-        {
-            switch(_output->info()->data_type())
-            {
-                case DataType::QS8:
-                {
-                    const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
-                    /* Fixed point position conversion QS8 -> QS8 */
-                    if(relative_shift != 0 || !in_place)
-                    {
-                        const auto relative_shift_vec = vdupq_n_qs8(relative_shift);
-                        execute_window_loop(window, [&](const Coordinates & id)
-                        {
-                            const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
-                            vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec));
-                        },
-                        input, output);
-                    }
-                    break;
-                }
-                case DataType::F32:
-                {
-                    /* Up-conversion QS8 -> F32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
-
-                        float32x4x2_t texels_low  = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input);
-                        float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input);
-
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
         case DataType::S16:
         {
             switch(_output->info()->data_type())
@@ -408,116 +341,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info
             }
             break;
         }
-        case DataType::QS16:
-        {
-            switch(_output->info()->data_type())
-            {
-                case DataType::QS16:
-                {
-                    const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
-                    /* Fixed point position conversion QS16 -> QS16 */
-                    if(relative_shift != 0 || !in_place)
-                    {
-                        const auto relative_shift_vec = vdupq_n_qs16(relative_shift);
-                        execute_window_loop(window, [&](const Coordinates & id)
-                        {
-                            const qint16x8x2_t texels_qs16 =
-                            {
-                                {
-                                    vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr())),
-                                    vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
-                                }
-                            };
-                            vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec));
-                            vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec));
-                        },
-                        input, output);
-                    }
-                    break;
-                }
-                case DataType::F32:
-                {
-                    /* Up-conversion QS16 -> F32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const int16x8x2_t texels_qs16 =
-                        {
-                            {
-                                vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
-                                vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
-                            }
-                        };
-
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input));
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
-        case DataType::F32:
-        {
-            switch(_output->info()->data_type())
-            {
-                case DataType::QS8:
-                {
-                    /* Down-conversion F32 -> QS8 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const float32x4x4_t texels_f32 =
-                        {
-                            {
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
-                            }
-                        };
-
-                        const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output);
-
-                        vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
-                    },
-                    input, output);
-                    break;
-                }
-                case DataType::QS16:
-                {
-                    /* Down-conversion F32 -> QS16 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const float32x4x2_t texels_f32_1 =
-                        {
-                            {
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
-                            }
-                        };
-                        const float32x4x2_t texels_f32_2 =
-                        {
-                            {
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
-                            }
-                        };
-
-                        vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output));
-                        vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output));
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
         default:
             ARM_COMPUTE_ERROR("Not supported");
     }
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 8cdf175d8a..09728e2a8d 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -115,7 +115,7 @@ public:
                     in_top += delta_input, in_mid += delta_input, in_low += delta_input,
                     p_out += num_elems_written_per_iteration)
                 {
-                    auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0, input_offset);
+                    auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
                     store_results<stridex>(p_out, vres);
                 }
             }
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index cfd8eacfdd..5b43e2b14f 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -122,7 +122,6 @@ void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, c
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != output->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 8960d8a8af..86a6d1c1a8 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -89,7 +89,6 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     _input     = input;
     _output    = output;
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 36b17bfc4c..47fcf12874 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -88,7 +88,6 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
     ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
@@ -96,7 +95,6 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o
     if(biases != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
         ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
         ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
     }
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index 4120e5f87a..47c895c594 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -54,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
 {
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32, 0);
+    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
 
     constexpr unsigned int num_elems_processed_per_iteration = 8;
 
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 5eafdf0363..54a046846a 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -43,34 +43,6 @@ using namespace arm_compute::detail;
 
 namespace
 {
-template <unsigned int stridex>
-qint16x8_t internal_vld1q(const qint16_t *in);
-
-template <>
-qint16x8_t internal_vld1q<1>(const qint16_t *in)
-{
-    return vld1q_qs16(in);
-}
-
-template <>
-qint16x8_t internal_vld1q<2>(const qint16_t *in)
-{
-    const int16x8x2_t tmp = vld2q_s16(in);
-    return tmp.val[0];
-}
-
-template <>
-qint16x8_t internal_vld1q<3>(const qint16_t *in)
-{
-    const int16x8x3_t tmp = vld3q_s16(in);
-    return tmp.val[0];
-}
-
-inline qint16x8_t internal_vdupq_n(qint16_t v)
-{
-    return vdupq_n_qs16(v);
-}
-
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <unsigned int stridex>
 float16x8_t internal_vld1q(const float16_t *in);
@@ -105,15 +77,13 @@ inline void internal_vst1q(float16_t *p, const float16x8_t &v)
     vst1q_f16(p, v);
 }
 
-float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y, int fixed_point_position)
+float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
     return vmulq_f16(x, y);
 }
 
-inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z, int fixed_point_position)
+inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
     return vaddq_f16(x, vmulq_f16(y, z));
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -151,107 +121,16 @@ inline void internal_vst1q(float *p, const float32x4_t &v)
     vst1q_f32(p, v);
 }
 
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
     return vmulq_f32(x, y);
 }
 
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
     return vmlaq_f32(x, y, z);
 }
 
-template <unsigned int stridex>
-qint8x8_t internal_vld1q(const qint8_t *in);
-
-template <>
-qint8x8_t internal_vld1q<1>(const qint8_t *in)
-{
-    return vld1_qs8(in);
-}
-
-template <>
-qint8x8_t internal_vld1q<2>(const qint8_t *in)
-{
-    const qint8x8x2_t tmp = vld2_s8(in);
-    return tmp.val[0];
-}
-
-template <>
-qint8x8_t internal_vld1q<3>(const qint8_t *in)
-{
-    const qint8x8x3_t tmp = vld3_s8(in);
-    return tmp.val[0];
-}
-
-inline qint8x8_t internal_vdupq_n(qint8_t v)
-{
-    return vdup_n_qs8(v);
-}
-
-inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
-{
-    return vmull_qs8(x, y, fixed_point_position);
-}
-
-inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
-{
-    return vqmlal_qs8(x, y, z, fixed_point_position);
-}
-
-inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
-{
-    vst1q_qs16(p, v);
-}
-
-inline void internal_vst1q(int32_t *p, const qint32x4x2_t &v)
-{
-    vst1q_s32(p, v.val[0]);
-    vst1q_s32(p + 4, v.val[1]);
-}
-
-template <unsigned int stridex>
-qint32x4x2_t internal_vld1q(const qint32_t *in);
-
-template <>
-qint32x4x2_t internal_vld1q<1>(const qint32_t *in)
-{
-    const qint32x4x2_t r =
-    {
-        {
-            vld1q_s32(in),
-            vld1q_s32(in + 4)
-        }
-    };
-    return r;
-}
-
-inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position)
-{
-    const qint32x4x2_t r =
-    {
-        {
-            vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position),
-            vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position),
-        }
-    };
-    return r;
-}
-
-inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position)
-{
-    const qint32x4x2_t r =
-    {
-        {
-            vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position),
-            vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position)
-        }
-    };
-    return r;
-}
-
 constexpr int small_tensor_size_optim = 8;
 inline bool run_optim_small_tensor_info(const ITensorInfo *t)
 {
@@ -355,21 +234,20 @@ public:
     static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                          const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
     {
-        const int          input_stride_x       = input->info()->strides_in_bytes().x();
-        const int          input_stride_y       = input->info()->strides_in_bytes().y();
-        const int          input_stride_z       = input->info()->strides_in_bytes().z();
-        const int          output_stride_y      = output->info()->strides_in_bytes().y();
-        const int          output_stride_z      = output->info()->strides_in_bytes().z();
-        const int          kernel_stride_z      = weights->info()->strides_in_bytes().z();
-        const int          kernel_stride_w      = weights->info()->strides_in_bytes()[3];
-        const int          output_w             = output->info()->dimension(0);
-        const int          output_h             = output->info()->dimension(1);
-        const int          range_z              = window.z().end() - window.z().start();
-        const int          kernel_depth         = weights->info()->dimension(Window::DimZ);
-        const unsigned int conv_stride_y        = std::get<1>(conv_info.stride());
-        const unsigned int conv_pad_left        = conv_info.pad_left();
-        const unsigned int conv_pad_top         = conv_info.pad_top();
-        const int          fixed_point_position = input->info()->fixed_point_position();
+        const int          input_stride_x  = input->info()->strides_in_bytes().x();
+        const int          input_stride_y  = input->info()->strides_in_bytes().y();
+        const int          input_stride_z  = input->info()->strides_in_bytes().z();
+        const int          output_stride_y = output->info()->strides_in_bytes().y();
+        const int          output_stride_z = output->info()->strides_in_bytes().z();
+        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
+        const int          kernel_stride_w = weights->info()->strides_in_bytes()[3];
+        const int          output_w        = output->info()->dimension(0);
+        const int          output_h        = output->info()->dimension(1);
+        const int          range_z         = window.z().end() - window.z().start();
+        const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
+        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_left   = conv_info.pad_left();
+        const unsigned int conv_pad_top    = conv_info.pad_top();
 
         // setup output window for the iterator
         Window window_out = window;
@@ -414,7 +292,7 @@ public:
                         auto      p_out     = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
                         {
-                            internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+                            internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
                         }
                     }
                 }
@@ -431,7 +309,7 @@ public:
                         auto      p_out     = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
                         {
-                            internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+                            internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
                         }
                     }
                 }
@@ -469,7 +347,7 @@ void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
 
 template <unsigned int stridex>
 float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
-                           const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position);
+                           const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
 
 inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
 {
@@ -511,9 +389,8 @@ inline float32x4x3_t load_input(const float *const in)
 
 template <>
 inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
-                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
     const float32x4x3_t vin0 = load_input(in_0);
     const float32x4x3_t vin1 = load_input(in_1);
     const float32x4x3_t vin2 = load_input(in_2);
@@ -601,10 +478,9 @@ inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const
 
 template <>
 inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
-                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
+    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
@@ -613,9 +489,9 @@ inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const
 
 template <>
 inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
-                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
 {
-    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
+    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
     return out;
 }
@@ -642,28 +518,6 @@ void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
     vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
 }
 
-template <unsigned int stridex>
-void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values);
-
-template <>
-void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
-    vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1]));
-}
-
-template <>
-void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
-}
-
-template <>
-void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
-}
-
 template <typename T1>
 class convolver_nhwc
 {
@@ -745,7 +599,7 @@ public:
                                 const auto we_addr   = reinterpret_cast<const T1 *>(we_addr_base1 + x * kernel_stride_x);
                                 const auto we_values = internal_vld1q<1>(we_addr);
 
-                                out_values = internal_vmlal(out_values, in_values, we_values, 0);
+                                out_values = internal_vmlal(out_values, in_values, we_values);
                             }
 
                             out_val += out_values[0];
@@ -784,24 +638,23 @@ public:
                          const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
     {
         ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
-        const int          input_stride_x       = input->info()->strides_in_bytes().x();
-        const int          input_stride_y       = input->info()->strides_in_bytes().y();
-        const int          input_stride_z       = input->info()->strides_in_bytes().z();
-        const int          output_stride_y      = output->info()->strides_in_bytes().y();
-        const int          output_stride_z      = output->info()->strides_in_bytes().z();
-        const int          kernel_stride_x      = weights->info()->strides_in_bytes().x();
-        const int          kernel_stride_y      = weights->info()->strides_in_bytes().y();
-        const int          kernel_stride_z      = weights->info()->strides_in_bytes().z();
-        const int          kernel_stride_w      = weights->info()->strides_in_bytes()[3];
-        const int          output_w             = output->info()->dimension(0);
-        const int          output_h             = output->info()->dimension(1);
-        const int          num_planes_z         = window.z().end() - window.z().start();
-        const int          delta_input          = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
-        const int          kernel_depth         = weights->info()->dimension(Window::DimZ);
-        const unsigned int conv_stride_y        = std::get<1>(conv_info.stride());
-        const unsigned int conv_pad_left        = conv_info.pad_left();
-        const unsigned int conv_pad_top         = conv_info.pad_top();
-        const int          fixed_point_position = input->info()->fixed_point_position();
+        const int          input_stride_x  = input->info()->strides_in_bytes().x();
+        const int          input_stride_y  = input->info()->strides_in_bytes().y();
+        const int          input_stride_z  = input->info()->strides_in_bytes().z();
+        const int          output_stride_y = output->info()->strides_in_bytes().y();
+        const int          output_stride_z = output->info()->strides_in_bytes().z();
+        const int          kernel_stride_x = weights->info()->strides_in_bytes().x();
+        const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
+        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
+        const int          kernel_stride_w = weights->info()->strides_in_bytes()[3];
+        const int          output_w        = output->info()->dimension(0);
+        const int          output_h        = output->info()->dimension(1);
+        const int          num_planes_z    = window.z().end() - window.z().start();
+        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
+        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_left   = conv_info.pad_left();
+        const unsigned int conv_pad_top    = conv_info.pad_top();
 
         // setup output window for the iterator
         Window window_out = window;
@@ -864,7 +717,7 @@ public:
                         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                             in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
                         {
-                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2);
                             store_results<stridex>(p_out, vres);
                         }
                     }
@@ -889,7 +742,7 @@ public:
                         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                             in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
                         {
-                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2);
                             accumulate_results<stridex>(p_out, vres);
                         }
                     }
@@ -908,24 +761,23 @@ public:
                          const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
     {
         ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
-        const int          input_stride_x       = input->info()->strides_in_bytes().x();
-        const int          input_stride_y       = input->info()->strides_in_bytes().y();
-        const int          input_stride_z       = input->info()->strides_in_bytes().z();
-        const int          output_stride_y      = output->info()->strides_in_bytes().y();
-        const int          output_stride_z      = output->info()->strides_in_bytes().z();
-        const int          kernel_stride_x      = weights->info()->strides_in_bytes().x();
-        const int          kernel_stride_y      = weights->info()->strides_in_bytes().y();
-        const int          kernel_stride_z      = weights->info()->strides_in_bytes().z();
-        const int          kernel_stride_w      = weights->info()->strides_in_bytes()[3];
-        const int          output_w             = output->info()->dimension(0);
-        const int          output_h             = output->info()->dimension(1);
-        const int          num_planes_z         = window.z().end() - window.z().start();
-        const int          delta_input          = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
-        const int          kernel_depth         = weights->info()->dimension(Window::DimZ);
-        const unsigned int conv_stride_y        = std::get<1>(conv_info.stride());
-        const unsigned int conv_pad_left        = conv_info.pad_left();
-        const unsigned int conv_pad_top         = conv_info.pad_top();
-        const int          fixed_point_position = input->info()->fixed_point_position();
+        const int          input_stride_x  = input->info()->strides_in_bytes().x();
+        const int          input_stride_y  = input->info()->strides_in_bytes().y();
+        const int          input_stride_z  = input->info()->strides_in_bytes().z();
+        const int          output_stride_y = output->info()->strides_in_bytes().y();
+        const int          output_stride_z = output->info()->strides_in_bytes().z();
+        const int          kernel_stride_x = weights->info()->strides_in_bytes().x();
+        const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
+        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
+        const int          kernel_stride_w = weights->info()->strides_in_bytes()[3];
+        const int          output_w        = output->info()->dimension(0);
+        const int          output_h        = output->info()->dimension(1);
+        const int          num_planes_z    = window.z().end() - window.z().start();
+        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
+        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_left   = conv_info.pad_left();
+        const unsigned int conv_pad_top    = conv_info.pad_top();
 
         // setup output window for the iterator
         Window window_out = window;
@@ -976,7 +828,7 @@ public:
                         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                             in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
                         {
-                            auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
+                            auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
                             store_results<stridex>(p_out, vres);
                         }
                     }
@@ -1001,7 +853,7 @@ public:
                         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                             in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
                         {
-                            auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
+                            auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
                             accumulate_results<stridex>(p_out, vres);
                         }
                     }
@@ -1120,7 +972,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
     const DataLayout data_layout = input->data_layout();
@@ -1140,11 +992,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
         TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
 
         DataType data_type = input->data_type();
-        if(is_data_type_fixed_point(data_type))
-        {
-            // Promote data type in case of fixed point
-            data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
-        }
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type);
@@ -1180,11 +1027,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
                 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                     case DataType::F16:
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                    case DataType::QS8:
-                    case DataType::QS16:
                         num_elems_written_per_iteration = 8;
                         break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                     case DataType::F32:
                         if(run_optim_small_tensor_info(input))
                         {
@@ -1215,13 +1060,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
                         break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                     case DataType::F16:
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                    case DataType::QS8:
-                    case DataType::QS16:
                         num_weight_elems_read_per_row   = 8 + kernel_size - 1;
                         num_elems_read_per_iteration    = 24;
                         num_elems_written_per_iteration = 32 >> conv_stride_x;
                         break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                     default:
                         ARM_COMPUTE_ERROR("Data type not supported.");
                         break;
@@ -1315,14 +1158,8 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens
 
     DataType data_type = input->info()->data_type();
 
-    if(is_data_type_fixed_point(data_type))
-    {
-        // Promote data type in case of fixed point
-        data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
-    }
-
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), output_shape, 1, data_type);
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info));
@@ -1371,12 +1208,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
             {
                 switch(_input->info()->data_type())
                 {
-                    case DataType::QS8:
-                        convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                        break;
-                    case DataType::QS16:
-                        convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                        break;
                     case DataType::F32:
                         convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                         break;
@@ -1395,9 +1226,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
             {
                 switch(_input->info()->data_type())
                 {
-                    case DataType::QS8:
-                        convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                        break;
                     case DataType::F32:
                         convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                         break;
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 37a3804289..e4cd4d0465 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -45,22 +45,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8,
-                                                         DataType::QS16, DataType::F16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8,
+                                                         DataType::F16,
                                                          DataType::QS32, DataType::S32, DataType::F32);
 
     if(bias != nullptr)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::S32, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::QS32, DataType::S32, DataType::F32);
 
-        if(is_data_type_fixed_point(input->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
-        }
-        else if(is_data_type_quantized_asymmetric(input->data_type()))
+        if(is_data_type_quantized_asymmetric(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -80,17 +73,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
     // Checks performed when output is configured
     if((output != nullptr) && (output->total_size() != 0))
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
-        if(is_data_type_fixed_point(input->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output");
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && output->data_type() != DataType::QS8, "Wrong data type for output");
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && output->data_type() != DataType::QS16, "Wrong data type for output");
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-        }
-        else if(is_data_type_quantized_asymmetric(output->data_type()))
+        if(is_data_type_quantized_asymmetric(output->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && output->data_type() != DataType::QASYMM8, "Wrong data type for bias");
         }
@@ -168,81 +154,24 @@ inline float32x4_t internal_vld1q(const float *in)
 {
     return vld1q_f32(in);
 }
-inline qint8x16_t internal_vld1q(const qint8_t *in)
-{
-    return vld1q_qs8(in);
-}
-inline qint16x8_t internal_vld1q(const qint16_t *in)
-{
-    return vld1q_qs16(in);
-}
-inline qint32x4_t internal_vld1q(const qint32_t *in)
-{
-    return vld1q_s32(in);
-}
 
 // Internal store
 inline void internal_vst1q(float *p, const float32x4_t &v)
 {
     vst1q_f32(p, v);
 }
-inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
-{
-    vst1q_qs8(p, v);
-}
-inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
-{
-    vst1_qs8(p, vqmovn_s16(v));
-}
-inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
-{
-    vst1q_qs16(p, v);
-}
-inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
-{
-    vst1q_s32(p, v);
-}
-
-inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
-{
-    vst1_qs16(p, vqmovn_qs32(v));
-}
 
 // Internal vdup
 inline float32x4_t internal_vdupq_n(float v)
 {
     return vdupq_n_f32(v);
 }
-inline qint8x16_t internal_vdupq_n(qint8_t v)
-{
-    return vdupq_n_qs8(v);
-}
-inline qint16x8_t internal_vdupq_n(qint16_t v)
-{
-    return vdupq_n_qs16(v);
-}
-inline qint32x4_t internal_vdupq_n(qint32_t v)
-{
-    return vdupq_n_qs32(v);
-}
 
 // Internal vadd
 inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
 {
     return vaddq_f32(x, y);
 }
-inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y)
-{
-    return vqaddq_qs8(x, y);
-}
-inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
-{
-    return vqaddq_qs16(x, y);
-}
-inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y)
-{
-    return vqaddq_qs32(x, y);
-}
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 inline float16x8_t internal_vld1q(const float16_t *in)
@@ -494,39 +423,6 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
     {
         switch(input->info()->data_type())
         {
-            case DataType::QS8:
-            {
-                if(bias == nullptr)
-                {
-                    _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
-                }
-                else
-                {
-                    _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
-                }
-                break;
-            }
-            case DataType::QS16:
-            {
-                if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
-                {
-                    _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
-                }
-                else if(bias == nullptr)
-                {
-                    _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
-                }
-                else
-                {
-                    ARM_COMPUTE_ERROR("Not implemented");
-                }
-                break;
-            }
-            case DataType::QS32:
-            {
-                _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
-                break;
-            }
             case DataType::S32:
             {
                 _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 747b8b1bfe..3d08cafa93 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -105,8 +105,8 @@ NEFillBorderKernel::NEFillBorderKernel()
 
 void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QASYMM8,
-                                                  DataType::QS16, DataType::U16, DataType::S16,
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QASYMM8,
+                                                  DataType::U16, DataType::S16,
                                                   DataType::U32, DataType::S32,
                                                   DataType::F16, DataType::F32);
 
@@ -147,7 +147,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
                 case DataType::U8:
                     fill_constant_value_single_channel<uint8_t>(window);
                     break;
-                case DataType::QS8:
                 case DataType::S8:
                     fill_constant_value_single_channel<int8_t>(window);
                     break;
@@ -155,7 +154,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
                     fill_constant_value_single_channel<uint16_t>(window);
                     break;
                 case DataType::S16:
-                case DataType::QS16:
                     fill_constant_value_single_channel<int16_t>(window);
                     break;
                 case DataType::U32:
@@ -192,7 +190,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
                 case DataType::U8:
                     fill_replicate_single_channel<uint8_t>(window);
                     break;
-                case DataType::QS8:
                 case DataType::S8:
                     fill_replicate_single_channel<int8_t>(window);
                     break;
@@ -200,7 +197,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
                     fill_replicate_single_channel<uint16_t>(window);
                     break;
                 case DataType::S16:
-                case DataType::QS16:
                     fill_replicate_single_channel<int16_t>(window);
                     break;
                 case DataType::U32:
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 72b652d5dc..872ac2661e 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,7 @@ void NEFloorKernel::configure(const ITensor *input, ITensor *output)
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Auto initialize output
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
 
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 12755a45f8..6519a39b9c 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -44,11 +44,10 @@ namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
-                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     if(output->total_size() != 0)
     {
@@ -57,7 +56,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
         output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index cab3c7a58f..421a6f0ef9 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -43,9 +43,8 @@ namespace
 {
 inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(biases, accum);
     ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0));
 
@@ -161,33 +160,6 @@ void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadI
             break;
         }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::QS8:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const qint8x16_t accum  = vld1q_qs8(reinterpret_cast<const qint8_t *>(in0_out.ptr()));
-                const qint8x16_t biases = vld1q_qs8(reinterpret_cast<const qint8_t *>(in1.ptr()));
-
-                vst1q_qs8(reinterpret_cast<qint8_t *>(in0_out.ptr()), vqaddq_qs8(accum, biases));
-            },
-            in0_out, in1);
-            break;
-        }
-        case DataType::QS16:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                qint16x8x2_t       accum  = vld2q_s16(reinterpret_cast<const qint16_t *>(in0_out.ptr()));
-                const qint16x8x2_t biases = vld2q_s16(reinterpret_cast<const qint16_t *>(in1.ptr()));
-
-                accum.val[0] = vqaddq_qs16(accum.val[0], biases.val[0]);
-                accum.val[1] = vqaddq_qs16(accum.val[1], biases.val[1]);
-
-                vst2q_s16(reinterpret_cast<qint16_t *>(in0_out.ptr()), accum);
-            },
-            in0_out, in1);
-            break;
-        }
         default:
             ARM_COMPUTE_ERROR("Data type not supported");
             break;
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index dfba74355b..d02504329a 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -91,54 +91,6 @@ void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &wi
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
-{
-    const int        fixed_point_position = input->info()->fixed_point_position();
-    const qint8x16_t beta_qs8             = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position));
-
-    Iterator in(input, window);
-    Iterator out(output, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto in_ptr  = reinterpret_cast<const qint8_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<qint8_t *>(out.ptr());
-
-        qint8x16_t       alpha_ab = vld1q_qs8(out_ptr);
-        const qint8x16_t c        = vld1q_qs8(in_ptr);
-
-        // Multiply matrix C by its weight and accumulate
-        alpha_ab = vqmlaq_qs8(alpha_ab, c, beta_qs8, fixed_point_position);
-
-        vst1q_qs8(out_ptr, alpha_ab);
-    },
-    in, out);
-}
-
-void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta)
-{
-    const int        fixed_point_position = input->info()->fixed_point_position();
-    const qint16x8_t beta_qs16            = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position));
-
-    Iterator in(input, window);
-    Iterator out(output, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto in_ptr  = reinterpret_cast<const qint16_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<qint16_t *>(out.ptr());
-
-        qint16x8x2_t       alpha_ab = vld2q_s16(out_ptr);
-        const qint16x8x2_t c        = vld2q_s16(in_ptr);
-
-        // Multiply matrix C by its weight and accumulate
-        alpha_ab.val[0] = vqmlaq_qs16(alpha_ab.val[0], c.val[0], beta_qs16, fixed_point_position);
-        alpha_ab.val[1] = vqmlaq_qs16(alpha_ab.val[1], c.val[1], beta_qs16, fixed_point_position);
-
-        vst2q_s16(out_ptr, alpha_ab);
-    },
-    in, out);
-}
 } // namespace
 
 NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
@@ -148,10 +100,9 @@ NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
 
 void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
 
@@ -160,12 +111,6 @@ void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output
         case DataType::F32:
             _func = &matrix_addition_f32;
             break;
-        case DataType::QS8:
-            _func = &matrix_addition_qs8;
-            break;
-        case DataType::QS16:
-            _func = &matrix_addition_qs16;
-            break;
         case DataType::F16:
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             _func = &matrix_addition_f16;
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 69b052a9bd..196398a2de 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -356,263 +356,6 @@ void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, IT
 }
 
 template <bool multiply_alpha>
-void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
-{
-    const auto width_matrix_b       = static_cast<int>(output->info()->dimension(0));
-    const auto in_b_stride          = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
-    const auto num_elems_vec_a      = static_cast<int>(input0->info()->dimension(0));
-    const int  fixed_point_position = input0->info()->fixed_point_position();
-
-    // The implementation computes 32 elements per iteration
-    const int window_start_x = 32 * info.thread_id;
-    const int window_step_x  = 32 * info.num_threads;
-    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
-    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-
-    Window win_out(window);
-    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
-    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    Window win_a(window);
-    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Window win_b;
-    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-    if(input1->info()->num_dimensions() >= 3)
-    {
-        win_b = window;
-    }
-    win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
-    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    Iterator ina(input0, win_a);
-    Iterator inb(input1, win_b);
-    Iterator out(output, win_out);
-
-    execute_window_loop(win_out, [&](const Coordinates & id)
-    {
-        if(id.x() > width_matrix_b)
-        {
-            return;
-        }
-
-        // Reset accumulators
-        qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
-
-        auto vec_a    = reinterpret_cast<const qint8_t *>(ina.ptr());
-        auto matrix_b = reinterpret_cast<const qint8_t *>(inb.ptr());
-
-        auto vec_a_end_addr = vec_a + num_elems_vec_a;
-        for(; vec_a <= (vec_a_end_addr - 2);)
-        {
-            const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0);
-            const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1);
-
-            const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride);
-            const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride);
-            const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride);
-            const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride);
-            const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride);
-            const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride);
-            const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride);
-            const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride);
-
-            // First accumulation
-            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
-            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
-            acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
-            acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
-
-            // Second accumulation
-            acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position);
-            acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position);
-            acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position);
-            acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position);
-
-            vec_a += 2;
-            matrix_b += 2 * in_b_stride;
-        }
-
-        for(; vec_a < vec_a_end_addr;)
-        {
-            const qint8x8_t a0 = vld1_dup_qs8(vec_a);
-
-            const qint8x8_t b00 = vld1_qs8(matrix_b + 0);
-            const qint8x8_t b01 = vld1_qs8(matrix_b + 8);
-            const qint8x8_t b02 = vld1_qs8(matrix_b + 16);
-            const qint8x8_t b03 = vld1_qs8(matrix_b + 24);
-
-            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
-            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
-            acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
-            acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
-
-            vec_a += 1;
-            matrix_b += in_b_stride;
-        }
-
-        // Convert back to qint8x8_t and saturate
-        qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
-        qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
-        qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
-        qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
-
-        // Multiply by the weight of the matrix product (alpha)
-        if(multiply_alpha)
-        {
-            const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
-            acc00_qs8                 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
-            acc01_qs8                 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
-            acc02_qs8                 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
-            acc03_qs8                 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
-        }
-
-        const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
-
-        // Store 8x4 output elements
-        vst1_qs8(mtx_out0 + 0, acc00_qs8);
-        vst1_qs8(mtx_out0 + 8, acc01_qs8);
-        vst1_qs8(mtx_out0 + 16, acc02_qs8);
-        vst1_qs8(mtx_out0 + 24, acc03_qs8);
-    },
-    ina, inb, out);
-}
-
-template <bool multiply_alpha>
-void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
-{
-    const auto width_matrix_b       = static_cast<int>(output->info()->dimension(0));
-    const auto in_b_stride          = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
-    const auto num_elems_vec_a      = static_cast<int>(input0->info()->dimension(0));
-    const int  fixed_point_position = input0->info()->fixed_point_position();
-
-    // The implementation computes 16 elements per iteration
-    const int window_start_x = 16 * info.thread_id;
-    const int window_step_x  = 16 * info.num_threads;
-    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
-    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-    ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
-
-    Window win_out(window);
-    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
-    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    Window win_a(window);
-    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Window win_b;
-    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-    if(input1->info()->num_dimensions() >= 3)
-    {
-        win_b = window;
-    }
-    win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
-    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    Iterator ina(input0, win_a);
-    Iterator inb(input1, win_b);
-    Iterator out(output, win_out);
-
-    execute_window_loop(win_out, [&](const Coordinates & id)
-    {
-        if(id.x() > width_matrix_b)
-        {
-            return;
-        }
-
-        // Reset accumulators
-        qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc02_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc03_qs32 = vdupq_n_qs32(0);
-
-        auto vec_a    = reinterpret_cast<const qint16_t *>(ina.ptr());
-        auto matrix_b = reinterpret_cast<const qint16_t *>(inb.ptr());
-
-        auto vec_a_end_addr = vec_a + num_elems_vec_a;
-        for(; vec_a <= (vec_a_end_addr - 2);)
-        {
-            const qint16x4_t a0 = vld1_dup_qs16(vec_a + 0);
-            const qint16x4_t a1 = vld1_dup_qs16(vec_a + 1);
-
-            const qint16x4_t b00 = vld1_qs16(matrix_b + 0 + 0 * in_b_stride);
-            const qint16x4_t b01 = vld1_qs16(matrix_b + 4 + 0 * in_b_stride);
-            const qint16x4_t b02 = vld1_qs16(matrix_b + 8 + 0 * in_b_stride);
-            const qint16x4_t b03 = vld1_qs16(matrix_b + 12 + 0 * in_b_stride);
-            const qint16x4_t b10 = vld1_qs16(matrix_b + 0 + 1 * in_b_stride);
-            const qint16x4_t b11 = vld1_qs16(matrix_b + 4 + 1 * in_b_stride);
-            const qint16x4_t b12 = vld1_qs16(matrix_b + 8 + 1 * in_b_stride);
-            const qint16x4_t b13 = vld1_qs16(matrix_b + 12 + 1 * in_b_stride);
-
-            // First accumulation
-            acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
-            acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
-            acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
-            acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);
-
-            // Second accumulation
-            acc00_qs32 = vqmlal_qs16(acc00_qs32, b10, a1, fixed_point_position);
-            acc01_qs32 = vqmlal_qs16(acc01_qs32, b11, a1, fixed_point_position);
-            acc02_qs32 = vqmlal_qs16(acc02_qs32, b12, a1, fixed_point_position);
-            acc03_qs32 = vqmlal_qs16(acc03_qs32, b13, a1, fixed_point_position);
-
-            vec_a += 2;
-            matrix_b += 2 * in_b_stride;
-        }
-
-        for(; vec_a < vec_a_end_addr;)
-        {
-            const qint16x4_t a0 = vld1_dup_qs16(vec_a);
-
-            const qint16x4_t b00 = vld1_qs16(matrix_b + 0);
-            const qint16x4_t b01 = vld1_qs16(matrix_b + 4);
-            const qint16x4_t b02 = vld1_qs16(matrix_b + 8);
-            const qint16x4_t b03 = vld1_qs16(matrix_b + 12);
-
-            acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
-            acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
-            acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
-            acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);
-
-            vec_a += 1;
-            matrix_b += in_b_stride;
-        }
-
-        // Convert back to qint16x4_t and saturate
-        qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
-        qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
-        qint16x4_t acc02_qs16 = vqmovn_qs32(acc02_qs32);
-        qint16x4_t acc03_qs16 = vqmovn_qs32(acc03_qs32);
-
-        // Multiply by the weight of the matrix product (alpha)
-        if(multiply_alpha)
-        {
-            const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
-            acc00_qs16                  = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
-            acc01_qs16                  = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
-            acc02_qs16                  = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position);
-            acc03_qs16                  = vqmul_qs16(acc03_qs16, alpha_qs16, fixed_point_position);
-        }
-
-        const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
-
-        // Store 16x4 output elements
-        vst1_qs16(mtx_out0 + 0, acc00_qs16);
-        vst1_qs16(mtx_out0 + 4, acc01_qs16);
-        vst1_qs16(mtx_out0 + 8, acc02_qs16);
-        vst1_qs16(mtx_out0 + 12, acc03_qs16);
-    },
-    ina, inb, out);
-}
-
-template <bool multiply_alpha>
 void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
 {
     const size_t in_b_stride          = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
@@ -1063,361 +806,12 @@ void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, IT
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <bool multiply_alpha>
-void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
-{
-    const size_t    in_b_stride          = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
-    const size_t    out_stride1          = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
-    const size_t    out_stride2          = out_stride1 * 2;
-    const size_t    out_stride3          = out_stride1 * 3;
-    const int       num_elems_matrix_b_x = input1->info()->dimension(0);
-    const int       fixed_point_position = input0->info()->fixed_point_position();
-    const qint8x8_t alpha_qs8            = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
-    ARM_COMPUTE_UNUSED(alpha_qs8);
-
-    // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
-    Window win_a(window);
-    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
-
-    Window win_b;
-    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-    if(input1->info()->num_dimensions() >= 3)
-    {
-        win_b = window;
-    }
-    // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix
-    // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 16x4
-    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride));
-    win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator ina(input0, win_a);
-    Iterator inb(input1, win_b);
-    Iterator out(output, window);
-
-    // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
-    // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
-    // All the values needed for computing a single 32x4 block will be read from consecutive memory positions
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        auto mtx_a0 = reinterpret_cast<const qint8_t *>(ina.ptr());
-        auto mtx_b0 = reinterpret_cast<const qint8_t *>(inb.ptr());
-        auto mtx_b1 = mtx_b0 + in_b_stride;
-
-        qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc10_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc20_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc30_qs16 = vdupq_n_qs16(0);
-
-        qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc11_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc21_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc31_qs16 = vdupq_n_qs16(0);
-
-        qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc12_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc22_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc32_qs16 = vdupq_n_qs16(0);
-
-        qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc13_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc23_qs16 = vdupq_n_qs16(0);
-        qint16x8_t acc33_qs16 = vdupq_n_qs16(0);
-
-        int k = 0;
-        // This for loop performs 2 accumulations
-        for(; k <= (num_elems_matrix_b_x - 32); k += 32)
-        {
-            const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
-            const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
-            const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
-            const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
-            const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4);
-            const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5);
-            const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6);
-            const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7);
-
-            const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
-            const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
-            const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
-            const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
-
-            // First accumulation
-            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
-            acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
-            acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
-            acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
-            acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
-            acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
-            acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
-            acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
-
-            const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16);
-            const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24);
-            const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16);
-            const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24);
-
-            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
-            acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
-            acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
-            acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
-            acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
-            acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
-            acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
-            acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
-
-#if __arm__
-            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
-            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
-            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
-
-            // Second accumulation
-            acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
-            acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position);
-            acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position);
-            acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position);
-            acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position);
-            acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position);
-            acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position);
-            acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position);
-            acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position);
-            acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position);
-            acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position);
-            acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position);
-            acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position);
-            acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position);
-            acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position);
-            acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position);
-
-            mtx_a0 += 8;
-            mtx_b0 += 32;
-            mtx_b1 += 32;
-        }
-
-        // This for loop performs the left over accumulations
-        for(; k < num_elems_matrix_b_x; k += 16)
-        {
-            const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
-            const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
-            const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
-            const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
-
-            const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
-            const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
-            const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
-            const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
-
-            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
-            acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
-            acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
-            acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
-            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
-            acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
-            acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
-            acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
-            acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
-            acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
-            acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
-            acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
-            acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
-            acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
-            acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
-            acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
-
-            mtx_a0 += 4;
-            mtx_b0 += 16;
-            mtx_b1 += 16;
-        }
-
-        // Convert back to qint8x8_t and saturate
-        qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
-        qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16);
-        qint8x8_t acc20_qs8 = vqmovn_qs16(acc20_qs16);
-        qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16);
-
-        qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
-        qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16);
-        qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16);
-        qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16);
-
-        qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
-        qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16);
-        qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16);
-        qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16);
-
-        qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
-        qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16);
-        qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16);
-        qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16);
-
-        // Multiply by the weight of the matrix product (alpha)
-        if(multiply_alpha)
-        {
-            acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
-            acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position);
-            acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position);
-            acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position);
-            acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
-            acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position);
-            acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position);
-            acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position);
-            acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
-            acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position);
-            acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position);
-            acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position);
-            acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
-            acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position);
-            acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position);
-            acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position);
-        }
-
-        const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
-
-        // Store 32x4 output elements
-        vst1_qs8(mtx_out0 + 0, acc00_qs8);
-        vst1_qs8(mtx_out0 + 8, acc01_qs8);
-        vst1_qs8(mtx_out0 + 16, acc02_qs8);
-        vst1_qs8(mtx_out0 + 24, acc03_qs8);
-        vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8);
-        vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8);
-        vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8);
-        vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8);
-        vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8);
-        vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8);
-        vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8);
-        vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8);
-        vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8);
-        vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8);
-        vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8);
-        vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8);
-    },
-    ina, inb, out);
-}
-
-template <bool multiply_alpha>
-void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
-{
-    const size_t     in_b_stride          = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
-    const size_t     out_stride1          = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
-    const size_t     out_stride2          = out_stride1 * 2;
-    const size_t     out_stride3          = out_stride1 * 3;
-    const int        num_elems_matrix_b_x = input1->info()->dimension(0);
-    const int        fixed_point_position = input0->info()->fixed_point_position();
-    const qint16x4_t alpha_qs16           = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
-    ARM_COMPUTE_UNUSED(alpha_qs16);
-
-    // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
-    Window win_a(window);
-    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
-
-    Window win_b;
-    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-    if(input1->info()->num_dimensions() >= 3)
-    {
-        win_b = window;
-    }
-    // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix
-    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
-    win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator ina(input0, win_a);
-    Iterator inb(input1, win_b);
-    Iterator out(output, window);
-
-    // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
-    // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 8x4 elements per iteration
-    // All the values needed for computing a single 8x4 block will be read from consecutive memory positions
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        auto mtx_a0 = reinterpret_cast<const qint16_t *>(ina.ptr());
-        auto mtx_b0 = reinterpret_cast<const qint16_t *>(inb.ptr());
-        auto mtx_b1 = mtx_b0 + in_b_stride;
-
-        qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc10_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc20_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc30_qs32 = vdupq_n_qs32(0);
-
-        qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc11_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc21_qs32 = vdupq_n_qs32(0);
-        qint32x4_t acc31_qs32 = vdupq_n_qs32(0);
-
-        // This for loop performs 1 accumulation
-        for(int k = 0; k <= (num_elems_matrix_b_x - 8); k += 8)
-        {
-            const qint16x4_t a0 = vld1_dup_qs16(mtx_a0 + 0);
-            const qint16x4_t a1 = vld1_dup_qs16(mtx_a0 + 1);
-            const qint16x4_t a2 = vld1_dup_qs16(mtx_a0 + 2);
-            const qint16x4_t a3 = vld1_dup_qs16(mtx_a0 + 3);
-
-            const qint16x4_t b00 = vld1_qs16(mtx_b0 + 0);
-            const qint16x4_t b01 = vld1_qs16(mtx_b0 + 4);
-
-            acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
-            acc10_qs32 = vqmlal_qs16(acc10_qs32, b00, a1, fixed_point_position);
-            acc20_qs32 = vqmlal_qs16(acc20_qs32, b00, a2, fixed_point_position);
-            acc30_qs32 = vqmlal_qs16(acc30_qs32, b00, a3, fixed_point_position);
-            acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
-            acc11_qs32 = vqmlal_qs16(acc11_qs32, b01, a1, fixed_point_position);
-            acc21_qs32 = vqmlal_qs16(acc21_qs32, b01, a2, fixed_point_position);
-            acc31_qs32 = vqmlal_qs16(acc31_qs32, b01, a3, fixed_point_position);
-
-            mtx_a0 += 4;
-            mtx_b0 += 8;
-            mtx_b1 += 8;
-        }
-
-        // Convert back to qint16x4_t and saturate
-        qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
-        qint16x4_t acc10_qs16 = vqmovn_qs32(acc10_qs32);
-        qint16x4_t acc20_qs16 = vqmovn_qs32(acc20_qs32);
-        qint16x4_t acc30_qs16 = vqmovn_qs32(acc30_qs32);
-
-        qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
-        qint16x4_t acc11_qs16 = vqmovn_qs32(acc11_qs32);
-        qint16x4_t acc21_qs16 = vqmovn_qs32(acc21_qs32);
-        qint16x4_t acc31_qs16 = vqmovn_qs32(acc31_qs32);
-
-        // Multiply by the weight of the matrix product (alpha)
-        if(multiply_alpha)
-        {
-            acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
-            acc10_qs16 = vqmul_qs16(acc10_qs16, alpha_qs16, fixed_point_position);
-            acc20_qs16 = vqmul_qs16(acc20_qs16, alpha_qs16, fixed_point_position);
-            acc30_qs16 = vqmul_qs16(acc30_qs16, alpha_qs16, fixed_point_position);
-            acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
-            acc11_qs16 = vqmul_qs16(acc11_qs16, alpha_qs16, fixed_point_position);
-            acc21_qs16 = vqmul_qs16(acc21_qs16, alpha_qs16, fixed_point_position);
-            acc31_qs16 = vqmul_qs16(acc31_qs16, alpha_qs16, fixed_point_position);
-        }
-
-        const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
-
-        // Store 8x4 output elements
-        vst1_qs16(mtx_out0 + 0, acc00_qs16);
-        vst1_qs16(mtx_out0 + 4, acc01_qs16);
-        vst1_qs16(mtx_out0 + out_stride1 + 0, acc10_qs16);
-        vst1_qs16(mtx_out0 + out_stride1 + 4, acc11_qs16);
-        vst1_qs16(mtx_out0 + out_stride2 + 0, acc20_qs16);
-        vst1_qs16(mtx_out0 + out_stride2 + 4, acc21_qs16);
-        vst1_qs16(mtx_out0 + out_stride3 + 0, acc30_qs16);
-        vst1_qs16(mtx_out0 + out_stride3 + 4, acc31_qs16);
-    },
-    ina, inb, out);
-}
-
 inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
 
     if(!is_interleaved)
     {
@@ -1428,7 +822,6 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
             ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
             ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
         }
     }
     else
@@ -1467,7 +860,6 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
             }
             ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
         }
     }
 
@@ -1492,16 +884,6 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
                 num_elems_processed_per_iteration_x = 16;
                 break;
             }
-            case DataType::QS8:
-            {
-                num_elems_processed_per_iteration_x = 32;
-                break;
-            }
-            case DataType::QS16:
-            {
-                num_elems_processed_per_iteration_x = 16;
-                break;
-            }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
@@ -1539,16 +921,6 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
                 num_elems_processed_per_iteration_x = 8;
                 break;
             }
-            case DataType::QS8:
-            {
-                num_elems_processed_per_iteration_x = 32;
-                break;
-            }
-            case DataType::QS16:
-            {
-                num_elems_processed_per_iteration_x = 8;
-                break;
-            }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
@@ -1638,18 +1010,6 @@ void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &inf
                 vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, info, _alpha);
                 break;
             }
-            case DataType::QS8:
-            {
-                multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, info, _alpha) :
-                vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, info, _alpha);
-                break;
-            }
-            case DataType::QS16:
-            {
-                multiply_alpha ? vector_matrix_multiply_qs16<true>(_input0, _input1, _output, window, info, _alpha) :
-                vector_matrix_multiply_qs16<false>(_input0, _input1, _output, window, info, _alpha);
-                break;
-            }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
@@ -1675,18 +1035,6 @@ void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &inf
                 matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
                 break;
             }
-            case DataType::QS8:
-            {
-                multiply_alpha ? matrix_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
-                matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
-                break;
-            }
-            case DataType::QS16:
-            {
-                multiply_alpha ? matrix_matrix_multiply_qs16<true>(_input0, _input1, _output, window, _alpha) :
-                matrix_matrix_multiply_qs16<false>(_input0, _input1, _output, window, _alpha);
-                break;
-            }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index c1e975e77e..8588f43edf 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -177,7 +177,6 @@ void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
     ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32));
     ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
 
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 5d6163d583..4517f46139 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,17 +54,15 @@ TensorShape get_output_shape(const ITensorInfo *input)
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
-                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     return Status{};
@@ -102,7 +100,7 @@ void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type());
 
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 86e3fd7a84..f03bc49ed3 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Size2D.h"
@@ -47,9 +46,8 @@ namespace
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                           bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
 
@@ -90,7 +88,6 @@ inline void linearize_volume(const uint8_t *const in_ptr,
                              int                  input_stride_x,
                              int                  input_stride_y,
                              int                  input_stride_z,
-                             int                  fixed_point_position,
                              int                  pad_value,
                              int                  dilation_x,
                              int                  dilation_y)
@@ -171,18 +168,7 @@ inline void linearize_volume(const uint8_t *const in_ptr,
     // Append 1 if the convolution layer has biases
     if(has_bias)
     {
-        if(std::is_same<T, qint8_t>::value)
-        {
-            *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position);
-        }
-        else if(std::is_same<T, qint16_t>::value)
-        {
-            *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position);
-        }
-        else
-        {
-            *out_ptr = static_cast<T>(1);
-        }
+        *out_ptr = static_cast<T>(1);
     }
 }
 } // namespace
@@ -251,7 +237,6 @@ void NEIm2ColKernel::run_generic(const Window &window)
                                       input_stride_x,
                                       input_stride_y,
                                       input_stride_z,
-                                      _input->info()->fixed_point_position(),
                                       offset,
                                       _dilation.x(),
                                       _dilation.y());
@@ -294,18 +279,7 @@ void NEIm2ColKernel::run_reduced(const Window &window)
         // Add bias
         if(_has_bias)
         {
-            if(std::is_same<T, qint8_t>::value)
-            {
-                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
-            }
-            else if(std::is_same<T, qint16_t>::value)
-            {
-                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
-            }
-            else
-            {
-                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
-            }
+            *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
         }
     }
     while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
@@ -366,12 +340,6 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
                 _func = &NEIm2ColKernel::run_reduced<float16_t>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            case DataType::QS8:
-                _func = &NEIm2ColKernel::run_reduced<qint8_t>;
-                break;
-            case DataType::QS16:
-                _func = &NEIm2ColKernel::run_reduced<qint16_t>;
-                break;
             case DataType::QASYMM8:
                 _func = &NEIm2ColKernel::run_reduced<qasymm8_t>;
                 break;
@@ -392,12 +360,6 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
                 _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            case DataType::QS8:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
-                break;
-            case DataType::QS16:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
-                break;
             case DataType::QASYMM8:
                 _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qasymm8_t, false> : &NEIm2ColKernel::run_generic<qasymm8_t, true>;
                 break;
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 91776d8100..ed037832af 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -103,7 +103,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+    auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
 
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal sum_access(sum, 0, num_elems_processed_per_iteration_sum);
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index 434f4eb3e9..d93dc09ff9 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -68,7 +68,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
     TensorShape output_shape = compute_min_max_shape(input);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+    auto_init_if_empty(*output, output_shape, 1, input->data_type());
 
     constexpr unsigned int num_elems_processed_per_iteration = 1;
 
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 776cb27d7a..253a93f196 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,26 +39,17 @@ namespace
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
 
-    if(is_data_type_fixed_point(input->data_type()))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared);
-        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
-        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
-        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
-    }
-
     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     return Status{};
@@ -162,44 +153,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
             }
             break;
         }
-        case DataType::QS8:
-        {
-            switch(norm_info.type())
-            {
-                case NormType::IN_MAP_1D:
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, false>;
-                    break;
-                case NormType::IN_MAP_2D:
-                    // Normalize over X and Y
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, true>;
-                    break;
-                case NormType::CROSS_MAP:
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 2, false>;
-                    break;
-                default:
-                    break;
-            }
-            break;
-        }
-        case DataType::QS16:
-        {
-            switch(norm_info.type())
-            {
-                case NormType::IN_MAP_1D:
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, false>;
-                    break;
-                case NormType::IN_MAP_2D:
-                    // Normalize over X and Y
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, true>;
-                    break;
-                case NormType::CROSS_MAP:
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 2, false>;
-                    break;
-                default:
-                    break;
-            }
-            break;
-        }
         default:
             ARM_COMPUTE_ERROR("NOT SUPPORTED!");
     }
@@ -306,105 +259,6 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
     }
 }
 
-template <DataType dt, unsigned int dim, bool do_2D_norm>
-void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
-{
-    Iterator input(_input, window);
-    Iterator input_squared(_input_squared, window);
-    Iterator output(_output, window);
-
-    const int dim_y                = 1;
-    const int radius               = _norm_info.norm_size() / 2;
-    const int total_size           = _input->info()->dimension(dim) - 1;
-    const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
-    // We account padding across X only and we iterate over rows
-    const int min_left   = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
-    const int max_right  = (dim == 2) ? total_size : total_size + border_size().left;
-    const int min_top    = 0;
-    const int max_bottom = _input->info()->dimension(dim_y) - 1;
-
-    const int fixed_point_position = _input->info()->fixed_point_position();
-
-    if(dt == DataType::QS8)
-    {
-        const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
-        const qint8x16_t beta_vec  = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
-        const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
-
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            // Get range to normalize
-            const int current_row   = do_2D_norm ? id[dim_y] : 0;
-            const int current_slice = id[dim];
-            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
-            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-            const int first_slice   = std::max(current_slice - radius, min_left);
-            const int last_slice    = std::min(current_slice + radius, max_right);
-
-            // Accumulate 2D In-Map values
-            qint8x16_t accu = vdupq_n_qs8(0);
-            for(int j = first_row; j <= last_row; ++j)
-            {
-                // Compute row displacement
-                const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
-                for(int i = first_slice; i <= last_slice; ++i)
-                {
-                    accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
-                }
-            }
-
-            // Normalize
-            const qint8x16_t accu_scale       = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
-            const qint8x16_t normalized       = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
-            const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
-            vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
-        },
-        input, input_squared, output);
-    }
-    else if(dt == DataType::QS16)
-    {
-        const qint16x8_t coeff_vec = vdupq_n_qs16_f32(_norm_info.scale_coeff(), fixed_point_position);
-        const qint16x8_t beta_vec  = vdupq_n_qs16_f32(_norm_info.beta(), fixed_point_position);
-        const qint16x8_t kappa_vec = vdupq_n_qs16_f32(_norm_info.kappa(), fixed_point_position);
-
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            // Get range to normalize
-            const int current_row   = do_2D_norm ? id[dim_y] : 0;
-            const int current_slice = id[dim];
-            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
-            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-            const int first_slice   = std::max(current_slice - radius, min_left);
-            const int last_slice    = std::min(current_slice + radius, max_right);
-
-            // Accumulate 2D In-Map values
-            qint16x8_t accu = vdupq_n_qs16(0);
-            for(int j = first_row; j <= last_row; ++j)
-            {
-                // Compute row displacement
-                const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
-                for(int i = first_slice; i <= last_slice; ++i)
-                {
-                    accu = vqaddq_qs16(accu, vld1q_qs16(reinterpret_cast<const qint16_t *>(input_squared_ptr + i * input_squared_stride)));
-                }
-            }
-
-            // Normalize
-            const qint16x8_t accu_scale       = vqmlaq_qs16(kappa_vec, coeff_vec, accu, fixed_point_position);
-            const qint16x8_t normalized       = vqpowq_qs16(accu_scale, beta_vec, fixed_point_position);
-            const qint16x8_t normalized_pixel = vdivq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), normalized, fixed_point_position);
-            vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), normalized_pixel);
-        },
-        input, input_squared, output);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Not supported");
-    }
-}
-
 Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index ae1d48cc69..e9bc8effc6 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -45,8 +45,8 @@ namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
-                                                         DataType::U16, DataType::S16, DataType::QS16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm.num_dimensions() == 3 && !(perm[0] == 2 && perm[1] == 0 && perm[2] == 1) && !(perm[0] == 1 && perm[1] == 2 && perm[2] == 0)),
@@ -59,7 +59,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 193ca3799c..0ec7e823a1 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -61,9 +61,9 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
     ARM_COMPUTE_UNUSED(overflow_policy);
     ARM_COMPUTE_UNUSED(rounding_policy);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                     "Output can only be U8 if both inputs are U8");
 
@@ -71,14 +71,6 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
-    if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
-    {
-        // Check that all data types are the same and all fixed-point positions are the same
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
-        // Check if scale is representable in fixed-point with the provided settings
-        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1);
-    }
-
     if(std::abs(scale - scale255_constant) < 0.00001f)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
@@ -120,11 +112,6 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
         {
             set_format_if_unknown(*output, Format::F16);
         }
-        else if(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8)
-        {
-            set_data_type_if_unknown(*output, DataType::QS8);
-            set_fixed_point_position_if_zero(*output, input1->fixed_point_position());
-        }
     }
 
     // Configure kernel window
@@ -220,105 +207,6 @@ void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict in
 }
 
 template <bool is_scale255, bool is_sat>
-void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
-{
-    const auto output = static_cast<qint8_t *__restrict>(output_ptr);
-
-    const qint8x16_t ta1 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input1_ptr));
-    const qint8x16_t ta2 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input2_ptr));
-
-    if(is_scale255)
-    {
-        qint16x8_t       tmp1_high = vmovl_s8(vget_high_s8(ta1));
-        qint16x8_t       tmp1_low  = vmovl_s8(vget_low_s8(ta1));
-        const qint16x8_t tmp2_high = vmovl_s8(vget_high_s8(ta2));
-        const qint16x8_t tmp2_low  = vmovl_s8(vget_low_s8(ta2));
-
-        const float32x4x2_t scale255_f32 =
-        {
-            {
-                scale255_constant_f32q,
-                scale255_constant_f32q
-            }
-        };
-        const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
-
-        tmp1_high = vmulq_qs16(tmp1_high, tmp2_high, fixed_point_position);
-        tmp1_low  = vmulq_qs16(tmp1_low, tmp2_low, fixed_point_position);
-        tmp1_high = vmulq_qs16(tmp1_high, scale255, fixed_point_position);
-        tmp1_low  = vmulq_qs16(tmp1_low, scale255, fixed_point_position);
-
-        if(is_sat)
-        {
-            vst1q_qs8(output, vcombine_s8(vqmovn_s16(tmp1_low), vqmovn_s16(tmp1_high)));
-        }
-        else
-        {
-            vst1q_qs8(output, vcombine_s8(vmovn_s16(tmp1_low), vmovn_s16(tmp1_high)));
-        }
-    }
-    else
-    {
-        const qint8x16_t vn  = vdupq_n_s8(-n);
-        qint8x16_t       res = ta2;
-
-        if(is_sat)
-        {
-            res = vqshlq_s8(vqmulq_qs8(ta1, res, fixed_point_position), vn);
-        }
-        else
-        {
-            res = vshlq_s8(vmulq_qs8(ta1, res, fixed_point_position), vn);
-        }
-        vst1q_qs8(output, res);
-    }
-}
-
-template <bool is_scale255, bool is_sat>
-void mul_QS16_QS16_QS16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
-{
-    const qint16x8x2_t ta1 = vld2q_qs16(static_cast<const qint16_t *__restrict>(input1_ptr));
-    qint16x8x2_t       res = vld2q_qs16(static_cast<const qint16_t *__restrict>(input2_ptr));
-
-    if(is_scale255)
-    {
-        const float32x4x2_t scale255_f32 =
-        {
-            {
-                scale255_constant_f32q,
-                scale255_constant_f32q
-            }
-        };
-        const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
-        if(is_sat)
-        {
-            res.val[0] = vqmulq_qs16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
-            res.val[1] = vqmulq_qs16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
-        }
-        else
-        {
-            res.val[0] = vmulq_qs16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
-            res.val[1] = vmulq_qs16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
-        }
-    }
-    else
-    {
-        const qint16x8_t vn = vdupq_n_s16(-n);
-        if(is_sat)
-        {
-            res.val[0] = vqshlq_s16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
-            res.val[1] = vqshlq_s16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
-        }
-        else
-        {
-            res.val[0] = vshlq_s16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
-            res.val[1] = vshlq_s16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
-        }
-    }
-    vst2q_s16(static_cast<qint16_t *__restrict>(output_ptr), res);
-}
-
-template <bool is_scale255, bool is_sat>
 inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
 {
     int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(input1));
@@ -529,7 +417,7 @@ void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict
 } // namespace
 
 NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
-    : _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+    : _func_float(nullptr), _func_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
 {
 }
 
@@ -550,7 +438,6 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
     _scale          = scale;
     _scale_exponent = 0;
     _func_int       = nullptr;
-    _func_q_int     = nullptr;
     _func_float     = nullptr;
 
     bool is_scale_255 = false;
@@ -630,28 +517,6 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
             _func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
         }
     }
-    else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output)
-    {
-        if(is_scale_255)
-        {
-            _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<true, true> : &mul_QS8_QS8_QS8_n<true, false>;
-        }
-        else
-        {
-            _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
-        }
-    }
-    else if(DataType::QS16 == dt_input1 && DataType::QS16 == dt_input2 && DataType::QS16 == dt_output)
-    {
-        if(is_scale_255)
-        {
-            _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<true, true> : &mul_QS16_QS16_QS16_n<true, false>;
-        }
-        else
-        {
-            _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<false, true> : &mul_QS16_QS16_QS16_n<false, false>;
-        }
-    }
     else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
     {
         _func_float = &mul_F16_F16_F16_n<false, false>;
@@ -724,17 +589,6 @@ void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo
         },
         input1, input2, output);
     }
-    else if(_func_q_int != nullptr)
-    {
-        int fixed_point_position = _input1->info()->fixed_point_position();
-        execute_window_loop(collapsed, [&](const Coordinates & id)
-        {
-            (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
-            collapsed.slide_window_slice_3D(slice_input1);
-            collapsed.slide_window_slice_3D(slice_input2);
-        },
-        input1, input2, output);
-    }
     else
     {
         ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 7877cf5cc0..e586b72d30 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -25,7 +25,6 @@
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEAsymm.h"
@@ -79,32 +78,6 @@ inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, c
     return 1.f / ((end_y - start_y) * (end_x - start_x));
 }
 
-inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
-                                      int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
-{
-    static const std::array<qint8_t, 10> scale_values_q8 =
-    { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
-    const int start_x = id.x() * stride_x - pad_x;
-    const int start_y = id.y() * stride_y - pad_y;
-    const int end_x   = std::min(start_x + pool_size, upper_bound_w);
-    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
-    const int val     = ((end_y - start_y) * (end_x - start_x));
-    return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
-}
-
-inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
-                                        int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
-{
-    static std::array<qint16_t, 10> scale_values_q16 =
-    { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
-    const int start_x = id.x() * stride_x - pad_x;
-    const int start_y = id.y() * stride_y - pad_y;
-    const int end_x   = std::min(start_x + pool_size, upper_bound_w);
-    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
-    const int val     = ((end_y - start_y) * (end_x - start_x));
-    return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
-}
-
 template <bool exclude_padding>
 inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
                                const int pool_size, const int upper_bound_w, const int upper_bound_h,
@@ -163,22 +136,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     int                 pool_stride_y   = 0;
     PoolingType         pool_type       = pool_info.pool_type();
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
-    const bool          exclude_padding = pool_info.exclude_padding();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
     static const std::set<int> supported_pool_sizes = { 2, 3 };
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
 
     ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8))
                                 && (pool_type != PoolingType::MAX));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
 
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                     || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
@@ -236,22 +205,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     {
         switch(input->data_type())
         {
-            case DataType::QS8:
-                num_elems_read_per_iteration = 16;
-                switch(pool_size_x)
-                {
-                    case 2:
-                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
-                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
-                        break;
-                    case 3:
-                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
-                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
-                        break;
-                    default:
-                        break;
-                }
-                break;
             case DataType::QASYMM8:
                 if(is_nhwc)
                 {
@@ -274,22 +227,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
                         break;
                 }
                 break;
-            case DataType::QS16:
-                num_elems_read_per_iteration = 8;
-                switch(pool_size_x)
-                {
-                    case 2:
-                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 4 : 8;
-                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
-                        break;
-                    case 3:
-                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 4 : 8;
-                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
-                        break;
-                    default:
-                        break;
-                }
-                break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
                 if(is_nhwc)
@@ -462,64 +399,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
     const DataType data_type = input->info()->data_type();
     const bool     is_nchw   = data_layout == DataLayout::NCHW;
 
-    // Select appropriate function
-    if(data_type == DataType::QS8)
-    {
-        if(_is_square)
-        {
-            switch(pool_size_x)
-            {
-                case 2:
-                    switch(pool_type)
-                    {
-                        case PoolingType::AVG:
-                            _func = &NEPoolingLayerKernel::pooling2_q8_nchw<PoolingType::AVG>;
-                            break;
-                        case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling2_q8_nchw<PoolingType::MAX>;
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                    }
-                    break;
-                case 3:
-                    switch(pool_type)
-                    {
-                        case PoolingType::AVG:
-                            _func = &NEPoolingLayerKernel::pooling3_q8_nchw<PoolingType::AVG>;
-                            break;
-                        case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling3_q8_nchw<PoolingType::MAX>;
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                    }
-                    break;
-                default:
-                    switch(pool_type)
-                    {
-                        case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<PoolingType::MAX>;
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                    }
-                    break;
-            }
-        }
-        else
-        {
-            switch(pool_type)
-            {
-                case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<PoolingType::MAX>;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
-            }
-        }
-    }
-    else if(data_type == DataType::QASYMM8)
+    if(data_type == DataType::QASYMM8)
     {
         if(pool_size_x == 2 && pool_stride_x < 3 && _is_square)
         {
@@ -606,62 +486,6 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
             }
         }
     }
-    else if(data_type == DataType::QS16)
-    {
-        if(_is_square)
-        {
-            switch(pool_size_x)
-            {
-                case 2:
-                    switch(pool_type)
-                    {
-                        case PoolingType::AVG:
-                            _func = &NEPoolingLayerKernel::pooling2_q16_nchw<PoolingType::AVG>;
-                            break;
-                        case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling2_q16_nchw<PoolingType::MAX>;
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                    }
-                    break;
-                case 3:
-                    switch(pool_type)
-                    {
-                        case PoolingType::AVG:
-                            _func = &NEPoolingLayerKernel::pooling3_q16_nchw<PoolingType::AVG>;
-                            break;
-                        case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling3_q16_nchw<PoolingType::MAX>;
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                    }
-                    break;
-                default:
-                    switch(pool_type)
-                    {
-                        case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw<PoolingType::MAX>;
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                    }
-                    break;
-            }
-        }
-        else
-        {
-            switch(pool_type)
-            {
-                case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw<PoolingType::MAX>;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
-            }
-        }
-    }
     else if(data_type == DataType::F16)
     {
         if(_is_square)
@@ -1022,71 +846,6 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
     INEKernel::configure(win_config.second);
 }
 
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window)
-{
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    const int     fixed_point_position = _input->info()->fixed_point_position();
-    constexpr int pool_size            = 2;
-    int           pool_stride_x        = 0;
-    int           pool_stride_y        = 0;
-    const int     pool_pad_right       = _pool_info.pad_stride_info().pad_right();
-    const int     pool_pad_top         = _pool_info.pad_stride_info().pad_top();
-    const int     pool_pad_left        = _pool_info.pad_stride_info().pad_left();
-    const int     pool_pad_bottom      = _pool_info.pad_stride_info().pad_bottom();
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
-
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto top_data    = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
-        const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
-        qint8x8_t  lower_res   = {};
-        qint8x8_t  upper_res   = {};
-        if(pooling_type == PoolingType::AVG)
-        {
-            // Calculate scale
-            const qint8_t   scale     = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
-            const qint8x8_t scale_vec = vdup_n_qs8(scale);
-
-            // Perform pooling
-            const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
-            lower_res                 = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
-            if(pool_stride_x == 1)
-            {
-                const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
-                upper_res                         = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
-            }
-        }
-        else
-        {
-            const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
-            lower_res                 = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
-            if(pool_stride_x == 1)
-            {
-                const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
-                upper_res                         = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
-            }
-        }
-        if(pool_stride_x == 1)
-        {
-            const qint8x8x2_t res = { { lower_res, upper_res } };
-            vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
-        }
-        else
-        {
-            vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
-        }
-    },
-    input, output);
-}
-
 template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window)
 {
@@ -1201,71 +960,6 @@ void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, con
     input, output);
 }
 
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_q16_nchw(const Window &window_input, const Window &window)
-{
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    const int     fixed_point_position = _input->info()->fixed_point_position();
-    constexpr int pool_size            = 2;
-    const int     pool_pad_right       = _pool_info.pad_stride_info().pad_right();
-    const int     pool_pad_top         = _pool_info.pad_stride_info().pad_top();
-    const int     pool_pad_left        = _pool_info.pad_stride_info().pad_left();
-    const int     pool_pad_bottom      = _pool_info.pad_stride_info().pad_bottom();
-    int           pool_stride_x        = 0;
-    int           pool_stride_y        = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
-
-    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto top_data    = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
-        const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
-        qint16x4_t lower_res   = {};
-        qint16x4_t upper_res   = {};
-        if(pooling_type == PoolingType::AVG)
-        {
-            // Calculate scale
-            const qint16_t   scale     = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
-            const qint16x4_t scale_vec = vdup_n_qs16(scale);
-
-            // Perform pooling
-            const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
-            lower_res                 = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
-            if(pool_stride_x == 1)
-            {
-                const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
-                upper_res                         = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
-            }
-        }
-        else
-        {
-            const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
-            lower_res                 = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
-            if(pool_stride_x == 1)
-            {
-                const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
-                upper_res                         = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
-            }
-        }
-        if(pool_stride_x == 1)
-        {
-            const qint16x4x2_t res = { { lower_res, upper_res } };
-            vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
-        }
-        else
-        {
-            vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
-        }
-    },
-    input, output);
-}
-
 template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window)
 {
@@ -1461,82 +1155,6 @@ void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const W
     input, output);
 }
 
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window)
-{
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    const int     fixed_point_position = _input->info()->fixed_point_position();
-    constexpr int pool_size            = 3;
-    const int     pool_pad_right       = _pool_info.pad_stride_info().pad_right();
-    const int     pool_pad_top         = _pool_info.pad_stride_info().pad_top();
-    const int     pool_pad_left        = _pool_info.pad_stride_info().pad_left();
-    const int     pool_pad_bottom      = _pool_info.pad_stride_info().pad_bottom();
-    int           pool_stride_x        = 0;
-    int           pool_stride_y        = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
-
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto top_data    = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
-        const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
-        const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
-        qint8x8_t  res         = {};
-        if(pooling_type == PoolingType::AVG)
-        {
-            // Calculate scale
-            const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
-
-            // Perform pooling for stride 2
-            const qint8x16_t sum_data  = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
-            const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
-            const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
-            const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
-            if(pool_stride_x == 2)
-            {
-                const qint8x8x2_t      table      = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
-                static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
-                const qint8x8_t        scale_vec  = vdup_n_qs8(scale);
-                res                               = vtbl2_s8(table, lookup_val);
-                res                               = vqmul_qs8(res, scale_vec, fixed_point_position);
-                vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
-            }
-            else
-            {
-                const qint8x16_t scale_vec = vdupq_n_qs8(scale);
-                vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
-            }
-        }
-        else
-        {
-            const qint8x16_t max_data  = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
-            const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
-            const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
-            const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
-
-            if(pool_stride_x == 2)
-            {
-                const qint8x8x2_t      table      = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
-                static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
-                res                               = vtbl2_s8(table, lookup_val);
-                vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
-            }
-            else
-            {
-                vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
-            }
-        }
-    },
-    input, output);
-}
-
 template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window)
 {
@@ -1657,77 +1275,6 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con
     input, output);
 }
 
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_q16_nchw(const Window &window_input, const Window &window)
-{
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    const int     fixed_point_position = _input->info()->fixed_point_position();
-    constexpr int pool_size            = 3;
-    const int     pool_pad_right       = _pool_info.pad_stride_info().pad_right();
-    const int     pool_pad_top         = _pool_info.pad_stride_info().pad_top();
-    const int     pool_pad_left        = _pool_info.pad_stride_info().pad_left();
-    const int     pool_pad_bottom      = _pool_info.pad_stride_info().pad_bottom();
-    int           pool_stride_x        = 0;
-    int           pool_stride_y        = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
-
-    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto top_data    = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
-        const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
-        const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
-
-        if(pooling_type == PoolingType::AVG)
-        {
-            // Calculate scale
-            const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
-
-            // Perform pooling for stride 2
-            const qint16x8_t sum_data  = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
-            const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
-            const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
-            const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
-            if(pool_stride_x == 2)
-            {
-                const qint16x4_t tmp       = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
-                const qint16x4_t scale_vec = vdup_n_qs16(scale);
-                vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
-            }
-            else
-            {
-                const qint16x8_t scale_vec = vdupq_n_qs16(scale);
-                vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
-            }
-        }
-        else
-        {
-            const qint16x8_t max_data  = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
-            const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
-            const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
-            const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
-
-            if(pool_stride_x == 2)
-            {
-                const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
-                vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
-            }
-            else
-            {
-                vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
-            }
-        }
-    },
-    input, output);
-}
-
 template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window)
 {
@@ -1879,110 +1426,6 @@ void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const W
     input, output);
 }
 
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window)
-{
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    const int pool_size_x   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
-    const int pool_size_y   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
-    const int pool_pad_top  = _pool_info.pad_stride_info().pad_top();
-    const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
-    int       pool_stride_x = 0;
-    int       pool_stride_y = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        qint8x16_t vres = {};
-        qint8_t    res  = {};
-
-        //PoolingType::MAX
-        for(int y = 0; y < pool_size_y; ++y)
-        {
-            int x = 0;
-            for(; x <= (pool_size_x - 16); x += 16)
-            {
-                const qint8x16_t data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
-                                                                                    (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
-                vres                  = vmaxq_s8(vres, data);
-            }
-
-            // Leftover for loop
-            for(; x < pool_size_x; ++x)
-            {
-                qint8_t data = *(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
-                res          = std::max(res, data);
-            }
-        }
-        //Reduce
-        const qint8x8_t half_vres = vpmax_s8(vget_low_s8(vres), vget_high_s8(vres));
-        res                       = std::max(res, vget_lane_s8(half_vres, 0));
-        res                       = std::max(res, vget_lane_s8(half_vres, 1));
-        res                       = std::max(res, vget_lane_s8(half_vres, 2));
-        res                       = std::max(res, vget_lane_s8(half_vres, 3));
-        res                       = std::max(res, vget_lane_s8(half_vres, 4));
-        res                       = std::max(res, vget_lane_s8(half_vres, 5));
-        res                       = std::max(res, vget_lane_s8(half_vres, 6));
-        res                       = std::max(res, vget_lane_s8(half_vres, 7));
-
-        // Store result
-        *(reinterpret_cast<qint8_t *>(output.ptr())) = res;
-    },
-    input, output);
-}
-
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::poolingMxN_q16_nchw(const Window &window_input, const Window &window)
-{
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    const int pool_size_x   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
-    const int pool_size_y   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
-    const int pool_pad_top  = _pool_info.pad_stride_info().pad_top();
-    const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
-    int       pool_stride_x = 0;
-    int       pool_stride_y = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        qint16x8_t vres = {};
-        qint16_t   res  = {};
-
-        //PoolingType::MAX
-        for(int y = 0; y < pool_size_y; ++y)
-        {
-            int x = 0;
-            for(; x <= (pool_size_x - 8); x += 8)
-            {
-                const qint16x8_t data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
-                                                                                      (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
-                vres                  = vmaxq_s16(vres, data);
-            }
-
-            // Leftover for loop
-            for(; x < pool_size_x; ++x)
-            {
-                qint16_t data = *(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
-                res           = std::max(res, data);
-            }
-        }
-        //Reduce
-        const qint16x4_t half_vres = vpmax_s16(vget_low_s16(vres), vget_high_s16(vres));
-        res                        = std::max(res, vget_lane_s16(half_vres, 0));
-        res                        = std::max(res, vget_lane_s16(half_vres, 1));
-        res                        = std::max(res, vget_lane_s16(half_vres, 2));
-        res                        = std::max(res, vget_lane_s16(half_vres, 3));
-
-        // Store result
-        *(reinterpret_cast<qint16_t *>(output.ptr())) = res;
-    },
-    input, output);
-}
-
 template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window)
 {
@@ -2688,8 +2131,6 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
         unsigned int window_x_inc = 0;
         switch(_input->info()->data_type())
         {
-            case DataType::QS8:
-            case DataType::QS16:
             case DataType::F16:
             {
                 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index ee23e76c5c..b49400ab7d 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -54,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
 {
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8, 0);
+    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8);
 
     constexpr unsigned int num_elems_processed_per_iteration = 8;
 
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index a209a523d3..4d908db77b 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,7 +51,7 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *r
 
     // Output auto inizialitation if not yet initialized
     TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 30d42fa25f..30f21bbf33 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -134,7 +134,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
     const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+    auto_init_if_empty(*output, output_shape, 1, input->data_type());
 
     unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
 
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 45ba68d9fa..d6f470445f 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -59,11 +59,10 @@ inline void reshape_tensor(const Window &window, const ITensor *input, ITensor *
 
 void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
                                                   DataType::U32, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
 
     _input  = input;
@@ -94,12 +93,10 @@ void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info)
         case DataType::U8:
         case DataType::S8:
         case DataType::QASYMM8:
-        case DataType::QS8:
             reshape_tensor<uint8_t>(window, _input, _output);
             break;
         case DataType::U16:
         case DataType::S16:
-        case DataType::QS16:
         case DataType::F16:
             reshape_tensor<uint16_t>(window, _input, _output);
             break;
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index d91efd267f..9946f002de 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -194,56 +194,7 @@ T sqadd(T a, T b);
 template <typename T>
 T sqsub(T a, T b);
 template <typename T>
-T sqmul(T a, T b, int fixed_point_position);
-
-#define DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(TYPET, TYPEU, TAGT, TAGU)                                        \
-    inline vec_8_byte_t<TYPET> vqsub(vec_8_byte_t<TYPET> a, vec_8_byte_t<TYPET> b)                              \
-    {                                                                                                           \
-        return vqsub_##TAGT(a, b);                                                                              \
-    }                                                                                                           \
-    inline vec_8_byte_t<TYPEU> vqadd(vec_8_byte_t<TYPEU> a, vec_8_byte_t<TYPEU> b)                              \
-    {                                                                                                           \
-        return vqadd_##TAGU(a, b);                                                                              \
-    }                                                                                                           \
-    inline vec_16_byte_t<TYPEU> vqadd(vec_16_byte_t<TYPEU> a, vec_16_byte_t<TYPEU> b)                           \
-    {                                                                                                           \
-        return vqaddq_##TAGU(a, b);                                                                             \
-    }                                                                                                           \
-    inline vec_8_byte_t<TYPET> vqexp(vec_8_byte_t<TYPET> vec, int fixed_point_position)                         \
-    {                                                                                                           \
-        return vqexp_q##TAGT(vec, fixed_point_position);                                                        \
-    }                                                                                                           \
-    inline auto vmovl(vec_8_byte_t<TYPET> vec)->decltype(vmovl_##TAGT(vec))                                     \
-    {                                                                                                           \
-        return vmovl_##TAGT(vec);                                                                               \
-    }                                                                                                           \
-    inline vec_16_byte_t<TYPET> vqrecip(vec_16_byte_t<TYPET> vec, int fixed_point_position)                     \
-    {                                                                                                           \
-        return vqrecipq_q##TAGT(vec, fixed_point_position);                                                     \
-    }                                                                                                           \
-    inline vec_16_byte_t<TYPET> vqmul(vec_16_byte_t<TYPET> a, vec_16_byte_t<TYPET> b, int fixed_point_position) \
-    {                                                                                                           \
-        return vqmulq_q##TAGT(a, b, fixed_point_position);                                                      \
-    }                                                                                                           \
-    template <>                                                                                                 \
-    inline TYPEU sqadd<TYPEU>(TYPEU a, TYPEU b)                                                                 \
-    {                                                                                                           \
-        return sqadd_q##TAGU(a, b);                                                                             \
-    }                                                                                                           \
-    inline TYPET sqexp(TYPET val, int fixed_point_position)                                                     \
-    {                                                                                                           \
-        return sqexp_q##TAGT(val, fixed_point_position);                                                        \
-    }                                                                                                           \
-    template <>                                                                                                 \
-    inline TYPET sqsub<TYPET>(TYPET a, TYPET b)                                                                 \
-    {                                                                                                           \
-        return sqsub_q##TAGT(a, b);                                                                             \
-    }                                                                                                           \
-    template <>                                                                                                 \
-    inline TYPET sqmul<TYPET>(TYPET a, TYPET b, int fixed_point_position)                                       \
-    {                                                                                                           \
-        return sqmul_q##TAGT(a, b, fixed_point_position);                                                       \
-    }
+T sqmul(T a, T b);
 
 #define DECLARE_NEON_FUNCTIONS_FOR_FLOAT(TYPE, TAG)                               \
     inline vec_8_byte_t<TYPE> vadd(vec_8_byte_t<TYPE> a, vec_8_byte_t<TYPE> b)    \
@@ -278,9 +229,6 @@ DECLARE_NEON_FUNCTIONS_FOR_TYPE(float16_t, f16)
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 DECLARE_NEON_FUNCTIONS_FOR_TYPE(float, f32)
 
-DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int8_t, int16_t, s8, s16)
-DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int16_t, int32_t, s16, s32)
-
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float16_t, f16)
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -373,16 +321,15 @@ namespace
 Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
     // Validate in case of configured output
     if(output.total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
     }
@@ -395,7 +342,7 @@ std::pair<Status, Window> validate_and_configure_window_logits_1d_max(ITensorInf
     // Softmax across the x dimension
     const TensorShape output_shape = TensorShape(input.tensor_shape()).set(0, 1);
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(output, output_shape, 1, input.data_type(), input.fixed_point_position(), input.quantization_info());
+    auto_init_if_empty(output, output_shape, 1, input.data_type(), input.quantization_info());
 
     // Configure kernel window
     const int input_width                       = input.valid_region().shape.x();
@@ -488,12 +435,6 @@ void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
         case DataType::QASYMM8:
             _func = &logits_1d_max<qasymm8_t>;
             break;
-        case DataType::QS8:
-            _func = &logits_1d_max<qint8_t>;
-            break;
-        case DataType::QS16:
-            _func = &logits_1d_max<qint16_t>;
-            break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
             _func = &logits_1d_max<float16_t>;
@@ -543,11 +484,12 @@ namespace
 Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensorInfo &max,
                                          const ITensorInfo &output, const float beta, const ITensorInfo &tmp)
 {
+    ARM_COMPUTE_UNUSED(beta);
     // Check input
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type());
@@ -555,7 +497,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor
     // Check max
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &max);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(input.tensor_shape()).set(0, 1), max.tensor_shape());
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &max);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &max);
 
     // Check output if configured
@@ -564,19 +505,14 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor
         const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info();
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON(output.quantization_info() != output_quantization);
     }
 
-    // Check beta
-    ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input.data_type()));
-
     // Check tmp if configured
     if(tmp.total_size() != 0)
     {
         const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type();
         ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &tmp);
         // We could potentially reduce tmp memory if we could predict or make an assumption
         // on the maximum number of threads that will run in parallel.
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &tmp);
@@ -727,88 +663,6 @@ void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *cons
     in_it, max_it, out_it);
 }
 
-template <typename T, typename U>
-void logits_1d_softmax_fixed_point(const ITensor &in, const ITensor &max, void *const tmp,
-                                   ITensor &out, const float /*beta*/, const Window &window)
-{
-    const int start_x     = in.info()->valid_region().anchor.x();
-    const int input_width = in.info()->valid_region().shape.x();
-
-    const int fixed_point_position = in.info()->fixed_point_position();
-
-    Iterator in_it(&in, window);
-    Iterator max_it(&max, window);
-    Iterator out_it(&out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Get pointers */
-        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
-        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
-        const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
-        vec_16_byte_t<T> vec_sum_inversed;
-
-        /* Compute exponentials and sum */
-        {
-            /* Get max value */
-            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
-            const auto vec_max = vdup_n<vec_8_byte_t<T>>(max_val);
-
-            /* Init sum to zero */
-            auto vec_sum = vdup_n<vec_16_byte_t<U>>(0);
-
-            /* Loop over row and compute exponentials and sum */
-            int           i        = 0;
-            constexpr int vec_size = vec_size_of(vec_sum);
-            for(; i <= (input_width - vec_size); i += vec_size)
-            {
-                auto vec_elements = vld<vec_8_byte_t<T>>(in_ptr + i);
-                vec_elements      = vqsub(vec_elements, vec_max);
-                vec_elements      = vqexp(vec_elements, fixed_point_position);
-                vec_sum           = vqadd(vec_sum, vmovl(vec_elements));
-                vst(tmp_ptr + i, vec_elements);
-            }
-            /* Reduce sum */
-            const vec_8_byte_t<U> sum_8_byte = vqadd(vget_high(vec_sum), vget_low(vec_sum));
-            U                     sum        = reduce_add(sqadd<U>, sum_8_byte);
-
-            /* Run remaining elements */
-            for(; i < input_width; ++i)
-            {
-                T element  = sqexp(sqsub(in_ptr[i], max_val), fixed_point_position);
-                sum        = sqadd<U>(sum, element);
-                tmp_ptr[i] = element;
-            }
-
-            const auto qsum  = utility::saturate_cast<T>(sum);
-            vec_sum_inversed = vqrecip(vdup_n<vec_16_byte_t<T>>(qsum), fixed_point_position);
-        }
-
-        /* Normalize exponentials */
-        {
-            /* Loop over row and compute softmax */
-            int           i        = 0;
-            constexpr int vec_size = vec_size_of(vec_sum_inversed);
-            for(; i <= (input_width - vec_size); i += vec_size)
-            {
-                const auto             vec_in           = vld<vec_16_byte_t<T>>(tmp_ptr + i);
-                const vec_16_byte_t<T> normalized_value = vqmul(vec_in, vec_sum_inversed, fixed_point_position);
-                vst(out_ptr + i, normalized_value);
-            }
-
-            const T sum_inversed = vget_lane<0>(vec_sum_inversed);
-
-            /* Run remaining elements */
-            for(; i < input_width; ++i)
-            {
-                out_ptr[i] = sqmul(tmp_ptr[i], sum_inversed, fixed_point_position);
-            }
-        }
-    },
-    in_it, max_it, out_it);
-}
-
 template <typename T>
 void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp,
                              ITensor &out, const float beta, const Window &window)
@@ -908,12 +762,6 @@ void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max
         case DataType::QASYMM8:
             _func = &logits_1d_softmax_qasymm8;
             break;
-        case DataType::QS8:
-            _func = &logits_1d_softmax_fixed_point<qint8_t, qint16_t>;
-            break;
-        case DataType::QS16:
-            _func = &logits_1d_softmax_fixed_point<qint16_t, qint32_t>;
-            break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
             _func = &logits_1d_softmax_float<float16_t>;
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 92271378ff..2630159561 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -74,7 +74,7 @@ unsigned int num_elems_processed(size_t element_size)
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
                                                          DataType::F16,
                                                          DataType::F32);
 
@@ -84,7 +84,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 3031a87637..f398409b26 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -105,14 +105,13 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
     if(biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
         ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
         ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
@@ -124,7 +123,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
     return Status{};
author	Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>	2018-07-04 09:34:00 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:54:10 +0000
commit	7485d5a62685cb745ab50e970adb722cb71557ac (patch)
tree	ba01b99ca466c93edc9a3f8c1e34394ff84be060 /src/core/NEON/kernels
parent	014333d73883c3872e458cedda5ccef586a7ccd4 (diff)
download	ComputeLibrary-7485d5a62685cb745ab50e970adb722cb71557ac.tar.gz