From 7485d5a62685cb745ab50e970adb722cb71557ac Mon Sep 17 00:00:00 2001 From: Vidhya Sudhan Loganathan Date: Wed, 4 Jul 2018 09:34:00 +0100 Subject: COMPMID-970 : Remove QS8 / QS16 support Removed fixed point related code. Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../core/NEON/kernels/NEActivationLayerKernel.h | 17 +- .../core/NEON/kernels/NEArithmeticAdditionKernel.h | 20 +- .../NEON/kernels/NEArithmeticSubtractionKernel.h | 20 +- .../NEON/kernels/NEBatchNormalizationLayerKernel.h | 21 +- arm_compute/core/NEON/kernels/NECol2ImKernel.h | 4 +- .../kernels/NEConvertFullyConnectedWeightsKernel.h | 4 +- .../NEON/kernels/NEDepthConcatenateLayerKernel.h | 2 +- .../core/NEON/kernels/NEDepthConvertLayerKernel.h | 13 +- .../NEON/kernels/NEDirectConvolutionLayerKernel.h | 8 +- .../NEDirectConvolutionLayerOutputStageKernel.h | 8 +- arm_compute/core/NEON/kernels/NEFillBorderKernel.h | 2 +- .../core/NEON/kernels/NEFillInnerBorderKernel.h | 2 +- .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 6 +- .../kernels/NEGEMMMatrixAccumulateBiasesKernel.h | 4 +- .../core/NEON/kernels/NEGEMMMatrixAdditionKernel.h | 4 +- .../core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h | 4 +- .../core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 4 +- arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 4 +- .../core/NEON/kernels/NENormalizationLayerKernel.h | 16 +- arm_compute/core/NEON/kernels/NEPermuteKernel.h | 4 +- .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 24 +- .../core/NEON/kernels/NEPoolingLayerKernel.h | 50 +---- .../core/NEON/kernels/NEReshapeLayerKernel.h | 2 +- .../core/NEON/kernels/NESoftmaxLayerKernel.h | 8 +- arm_compute/core/NEON/kernels/NETransposeKernel.h | 6 +- .../core/NEON/kernels/NEWeightsReshapeKernel.h | 4 +- .../NEON/kernels/detail/NEDirectConvolution3x3.h | 14 +- .../kernels/detail/NEDirectConvolutionDetail.h | 249 ++++----------------- 28 files changed, 121 insertions(+), 403 deletions(-) (limited to 'arm_compute/core/NEON/kernels') diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h index 06a0a01782..0290e32085 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -24,7 +24,6 @@ #ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ #define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ -#include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/QAsymm8.h" @@ -59,7 +58,7 @@ public: * @note If the output tensor is a nullptr, the activation function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * of the activation function. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] activation_info Activation layer information. */ @@ -67,7 +66,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * of the activation function. Data types supported: QASYMM8/F16/F32. 
* @param[in] output Destination tensor info. Data type supported: same as @p input * @param[in] act_info Activation layer information. * @@ -99,24 +98,12 @@ private: template typename std::enable_if::value, void>::type activation(const Window &window); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template - typename std::enable_if::value, void>::type activation(const Window &window); /** Function to apply an activation function on a tensor. * * @param[in] window Region on which to execute the kernel */ template typename std::enable_if::value, void>::type activation(const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template - typename std::enable_if::value, void>::type activation(const Window &window); private: ITensor *_input; diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h index 155e792f5d..8cf21eae9d 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -57,26 +57,24 @@ public: * Valid configurations (Input1,Input2) -> Output : * * - (U8,U8) -> U8 - * - (QS8,QS8) -> QS8 * - (U8,U8) -> S16 * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 - * - (QS16,QS16) -> QS16 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] policy Overflow policy. * * @return a status @@ -90,9 +88,9 @@ public: private: /** Common signature for all the specialised add functions * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] window Region on which to execute the kernel. 
*/ using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h index 73ecfcfeb5..3e93922b65 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -57,26 +57,24 @@ public: * Valid configurations (Input1,Input2) -> Output : * * - (U8,U8) -> U8 - * - (QS8,QS8) -> QS8 * - (U8,U8) -> S16 * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 - * - (QS16,QS16) -> QS16 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel * - * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32 + * @param[in] output Output tensor. Data types supported: U8/S16/F16/F32 * @param[in] policy Policy to use to handle overflow. * * @return a status @@ -89,9 +87,9 @@ public: private: /** Common signature for all the specialised sub functions * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] window Region on which to execute the kernel. */ using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 2d33f87dfa..2a540c151b 100644 --- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -57,7 +57,7 @@ public: * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[out] output Destination tensor. 
Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input @@ -72,7 +72,7 @@ public: * * @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input @@ -96,22 +96,7 @@ private: void configure_non_fused(); /** Configure execution function in case of fused activation **/ void configure_fused(); - /** Template function to run batch normalization on 8-bit fixed point - * - * @tparam fused_activation Boolean that flags if its a fused activation or not - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void batch_normalization_qs8(const Window &window); - /** Template function to run batch normalization on 16-bit fixed point - * - * @tparam fused_activation Boolean that flags if its a fused activation or not - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void batch_normalization_qs16(const Window &window); + /** Template function to run batch normalization on fp16 * * @tparam fused_activation Boolean that flags if its a fused activation or not diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h index 9fb493cc4f..f02858e7d9 100644 --- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h +++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h @@ -72,7 +72,7 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. @@ -80,7 +80,7 @@ public: void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims); /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output The output tensor. 
3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h index 65ce764246..d5c9e3bbe9 100644 --- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h +++ b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h @@ -59,7 +59,7 @@ public: ~NEConvertFullyConnectedWeightsKernel() = default; /** Set the input and output tensor. * - * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32. + * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32. * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format. * @param[in] data_layout The data layout the weights have been trained in. @@ -67,7 +67,7 @@ public: void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeightsKernel * - * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32. + * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32. * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format. * @param[in] data_layout The data layout the weights have been trained in. diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h index 67ef5293b7..12a5051ef8 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h @@ -55,7 +55,7 @@ public: ~NEDepthConcatenateLayerKernel() = default; /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. + * @param[in] input Input tensor. Data types supported: F16/F32. * @param[in] depth_offset The offset on the Z axis. * @param[in,out] output Output tensor. Data types supported: Same as @p input. * diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h index 50536f2b47..77bb0413ca 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h @@ -55,19 +55,12 @@ public: * * Valid conversions Input -> Output : * - * - QS8 -> QS8, F32 * - U8 -> U16, S16, S32 * - U16 -> U8, U32 * - S16 -> U8, S32 - * - QS16 -> QS16, F32 - * - F32 -> QS8 * - * @warning In case of in-place fixed point position conversion make sure that configure has been called - * before the updated tensor is used in other functions, as the TensorInfo of the tensor will be - * altered. 
In-place is only supported for QS8 -> QS8, QS16 -> QS16. - * - * @param[in, out] input The input tensor to convert (Written in case of in-place computation). Data types supported: U8/QS8/U16/S16/F32. - * @param[out] output The output tensor. Can be null in case of in-place computation. Data types supported: U8/QS8/U16/S16/U32/S32/F32. + * @param[in, out] input The input tensor to convert (Written in case of in-place computation). Data types supported: U8/U16/S16. + * @param[out] output The output tensor. Can be null in case of in-place computation. Data types supported: U8/U16/S16/U32/S32/F32. * @param[in] policy Conversion policy. * @param[in] shift (Optional) Value for down/up conversions. Must be 0 <= shift < 8. * In case of fixed point position conversion, it specifies the new fixed point position, if operation is in-place. @@ -82,8 +75,6 @@ private: ITensor *_output; ConvertPolicy _policy; uint32_t _shift; - int _fixed_point_position_input; - int _fixed_point_position_output; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h index f859f97dae..589725ab01 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h @@ -57,24 +57,24 @@ public: * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. * @param[out] output Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32 + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. */ void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info); /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. * @param[in] output Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32 + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. 
Data types supported: QS32/F16/F32 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h index 77711d7ecd..7fd1d70374 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h @@ -55,10 +55,10 @@ public: /** Set the accumulate buffer and the biases of the kernel. * * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS16/QS32/F16/F32 + * Data type supported: QS32/F16/F32 * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Data type supported: QS8/QS16/F16/F32 + * Data type supported: F16/F32 * @param[in] result_fixedpoint_multiplier (Optional)Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add * @param[in] result_shift (Optional)Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication * @param[in] result_offset_after_shift (Optional)Offset to be applied to result before converting it back to QASYMM8 @@ -68,10 +68,10 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel * * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS16/QS32/F16/F32 + * Data type supported: QS32/F16/F32 * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Data type supported: QS8/QS16/F16/F32 + * Data type supported: F16/F32 * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr); diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h index dd19b8f35a..cff6b4ea2d 100644 --- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h +++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h @@ -57,7 +57,7 @@ public: * * @note This kernel fills the borders within the XY-planes. * - * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QS8/QASYMM8/QS16/S16/S32/F32. + * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QASYMM8/S16/S32/F32. * @param[in] border_size Size of the border to fill in elements. * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h index 545a265dc2..2b6c7af72a 100644 --- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h +++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h @@ -57,7 +57,7 @@ public: * * @note This kernel fills the borders within the XY-planes. 
* - * @param[in,out] input Tensor to process. Data types supported: U8/QS8/S16/S32/F32. + * @param[in,out] input Tensor to process. Data types supported: U8/S16/S32/F32. * @param[in] border_size Size of the border to fill in elements. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h index 79504fd4da..5c0104d138 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -60,13 +60,13 @@ public: NEGEMMInterleave4x4Kernel(); /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. * * @return a status @@ -79,7 +79,7 @@ public: private: /** Common signature for all the transpose functions * - * @param[in] input An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input An input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h index e48a9a77e4..419a9f9150 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h @@ -51,13 +51,13 @@ public: ~NEGEMMMatrixAccumulateBiasesKernel() = default; /** Set the accumulate buffer and the biases of the kernel. * - * @param[in, out] accum The accumulate tensor to convert. Data type supported: QS8/QS16/F32 + * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32 * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p input */ void configure(ITensor *accum, const ITensor *biases); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel * - * @param[in] accum The accumulate tensor to convert. Data type supported: QS8/QS16/F32 + * @param[in] accum The accumulate tensor to convert. Data type supported: F32 * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. 
Data type supported: Same as @p input * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h index 5e4f8b72ff..1a235933dc 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h @@ -59,7 +59,7 @@ public: * * @note The input and output tensor must have the same dimensions * - * @param[in] input Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32 + * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32 * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input. * @param[in] beta Weight of matrix C */ @@ -71,7 +71,7 @@ public: private: /** Common signature for all the matrix addition functions * - * @param[in] input An input tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] input An input tensor. Data types supported: F16/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. * @param[in] beta Weight of matrix C diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h index d54522c678..6ee958205e 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h @@ -58,7 +58,7 @@ public: * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel * These two kernels change the layout of the original matrices to be more cache-friendly. * - * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32 + * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector. * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0 * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. @@ -69,7 +69,7 @@ public: void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel * - * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32 + * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector. * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0 * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. 
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h index fcdd8dd93c..b7fbfcfcd2 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h @@ -74,13 +74,13 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor info. Data type supported: same as @p input. * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h index 5aa803f4fd..d455fd98b3 100644 --- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h @@ -77,7 +77,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * Note: QASYMM8 works only for has_bias = false * @param[out] output The output tensor. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -92,7 +92,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * Note: QASYMM8 works only for has_bias = false * @param[in] output The output tensor. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h index 6ae7b73423..92086437a6 100644 --- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h @@ -54,7 +54,7 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. + * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. * @param[in] input_squared Source with each element has been squared. 
3 lower dims represent a single input with dimensions [width, height, IFM], * Data type supported: same as @p input * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input @@ -64,7 +64,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. + * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], * Data type supported: same as @p input * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input @@ -92,18 +92,6 @@ private: template void normalize_float(const Window &window); - /** Function to perform normalization for fixed-point values depending on - * the given template dimension. The second template parameter specifies - * whether the normalization has to be 1D or 2D. - * - * @note Only supported normalizations are: - * - 1D over X or Z - * - 2D over X and Y - * - * @param[in] window Region on which to execute the kernel. - */ - template - void normalize_fixed_point(const Window &window); /** Common signature for all the specialised normalization functions * * @param[in] window Region on which to execute the kernel. diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/arm_compute/core/NEON/kernels/NEPermuteKernel.h index 68bbdcb3cb..b56faa8514 100644 --- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h +++ b/arm_compute/core/NEON/kernels/NEPermuteKernel.h @@ -58,7 +58,7 @@ public: * * @note Supported permutation vectors : [2, 0, 1], [1, 2, 0] * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data types supported: Same as @p input * @param[in] perm Permutation vector */ @@ -67,7 +67,7 @@ public: * * @note Supported permutation vectors : [2, 0, 1], [1, 2, 0] * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output The output tensor. Data types supported: Same as @p input * @param[in] perm Permutation vector * diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h index 8c245569a5..41ea91495f 100644 --- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h @@ -55,11 +55,10 @@ public: * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * For QS8/QS16 scale = 1 is the only supported value. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. 
Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. @@ -70,11 +69,10 @@ public: * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * For QS8/QS16 scale = 1 is the only supported value. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. @@ -96,15 +94,6 @@ private: * @param[out] output_ptr Pointer to the output tensor. */ using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale); - /** Common signature for all the specialised multiplication functions with fixed-point values - * - * @param[in] input1_ptr Pointer to the first input tensor. - * @param[in] input2_ptr Pointer to the second input tensor. - * @param[in] scale Scaling factor. - * @param[in] fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number. - * @param[out] output_ptr Pointer to the output tensor. - */ - using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position); /** Common signature for all the specialised multiplication functions with float scaling factor * * @param[in] input1_ptr Pointer to the first input tensor. 
@@ -115,7 +104,6 @@ private: MulFunctionFloat *_func_float; MulFunctionInt *_func_int; - MulFunctionQInt *_func_q_int; private: const ITensor *_input1; diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h index 4140ccf1ed..6c4c1db289 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -52,18 +52,18 @@ public: ~NEPoolingLayerKernel() = default; /** Set the input and output tensors. * - * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only + * @note F16 are supported for pool sizes 2 and 3 only * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel * - * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only + * @note F16 are supported for pool sizes 2 and 3 only * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @@ -90,13 +90,6 @@ private: */ template void pooling2_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform 2x2 pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling2_q8_nchw(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for 8bit asymmetric fixed point. * * @param[in] window_input Input region on which to execute the kernel. @@ -104,13 +97,6 @@ private: */ template void pooling2_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform 2x2 pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling2_q16_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling. * * @param[in] window_input Input region on which to execute the kernel. @@ -125,13 +111,6 @@ private: */ template void pooling3_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform 3x3 pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling3_q8_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling for 8bit quantized fixed point. * * @param[in] window_input Input region on which to execute the kernel. @@ -139,13 +118,6 @@ private: */ template void pooling3_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform 3x3 pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. 
- * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling3_q16_nchw(const Window &window_input, const Window &window); /** Function to perform 7x7 pooling. * * @param[in] window_input Input region on which to execute the kernel. @@ -153,13 +125,6 @@ private: */ template void pooling7_f32_nchw(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void poolingMxN_q8_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 8-bit quantized. * * @param[in] window_input Input region on which to execute the kernel. @@ -174,13 +139,6 @@ private: */ template void poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void poolingMxN_q16_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 16-bit floating point values. * * @param[in] window_input Input region on which to execute the kernel. diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h index 0a3fc44881..08b4e11189 100644 --- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h @@ -40,7 +40,7 @@ public: } /** Set the input and output of the kernel * - * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/QASYMM8/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/U16/S16/QASYMM8/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h index c30a4cd23d..25c3196e34 100644 --- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -43,13 +43,13 @@ public: NELogits1DMaxKernel(); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data types supported: same as @p input */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor. Data types supported: same as @p input * * @return a status @@ -90,7 +90,7 @@ public: ~NELogits1DSoftmaxKernel() = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] max Max values tensor. Same shape as input with dimension 0 set to 1. * Data types supported: same as @p input. * @param[out] output Destination tensor. 
Data types supported: same as @p input. @@ -101,7 +101,7 @@ public: void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp); /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel * - * @param[in] input Source tensor info. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. * Data types supported: same as @p input. * @param[in] output Destination tensor info. Data types supported: same as @p input. diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h index dc7ef8ff7a..76823acfa1 100644 --- a/arm_compute/core/NEON/kernels/NETransposeKernel.h +++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h @@ -57,13 +57,13 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor. Data type supported: Same as @p input * * @return a status @@ -76,7 +76,7 @@ public: private: /** Common signature for all the transpose functions * - * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input An input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. */ diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h index 1a7525bfc7..21f36f6c2b 100644 --- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h +++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h @@ -75,7 +75,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F32 * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. @@ -85,7 +85,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel * * @param[in] input The input tensor to convert. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32 * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h index fee206638b..fd0c0f0c34 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -45,13 +45,11 @@ inline float32x4x3_t load_matrix_row(const float *ptr) } template -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position); +float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const float32x4x3_t vtop = { { @@ -108,9 +106,9 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); @@ -118,9 +116,9 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); return out; } diff --git 
a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index 908fa13876..d56fd44700 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -55,29 +55,6 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) return r; } -/** Loads a 3x3 matrix as a row (qint8_t). - * - * @param[in] ptr Pointer to a qint8 3x3 matrix. - * @param[in] weights_offset (Optional) Weights quantization offset. - * - * @return The loaded matrix. - */ -inline qint8x8x3_t load_matrix_row(const qint8_t *ptr, int weights_offset = 0) -{ - ARM_COMPUTE_UNUSED(weights_offset); - /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: - r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - const qint8x8x3_t r = - { - { - vld1_dup_qs8(ptr), - vld1_dup_qs8(1 + ptr), - vld1_dup_qs8(2 + ptr) - } - }; - return r; -} - /** Loads a 3x3 matrix as a row (uint8_t). * * @param[in] ptr Pointer to a uint8_t 3x3 matrix. @@ -104,27 +81,25 @@ inline int32x4x3_t load_matrix_row(const uint8_t *ptr, int weights_offset = 0) /** Perform a convolve3x3 on float32. * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] input_offset (Optional) Input quantization offset. 
* */ template float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset = 0); + int input_offset = 0); template <> inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); ARM_COMPUTE_UNUSED(input_offset); const float32x4x3_t vtop = @@ -185,11 +160,11 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c template <> inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); @@ -199,145 +174,35 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c template <> inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); return out; } -/** Perform a convolve3x3 on qint16. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. 
- * - */ -template -qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset = 0); - -template <> -inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(fixed_point_position); - ARM_COMPUTE_UNUSED(input_offset); - - const qint8x8x3_t vtop = - { - { - vld1_qs8(in_top), - vld1_qs8(in_top + 8), - vld1_qs8(in_top + 16) - } - }; - const qint8x8x3_t vmid = - { - { - vld1_qs8(in_mid), - vld1_qs8(in_mid + 8), - vld1_qs8(in_mid + 16) - } - }; - const qint8x8x3_t vlow = - { - { - vld1_qs8(in_low), - vld1_qs8(in_low + 8), - vld1_qs8(in_low + 16) - } - }; - qint16x8x2_t out = - { - { - vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position), - vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position) - } - }; - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position); - return out; -} - -template <> -inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(input_offset); - - qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5); - out.val[0] = 
vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7); - return out; -} - -template <> -inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(input_offset); - - qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3); - return out; -} - /** Perform a convolve3x3 on uint8_t * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] input_offset (Optional) Input quantization offset. * */ template int32x4x2_t convolve_3x3(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset); + int input_offset); template <> inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const int32x4_t v_input_offset = vdupq_n_s32(input_offset); const uint8x8x2_t vtop = @@ -427,11 +292,9 @@ inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, template <> inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - - int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 2), out.val[0], 3); @@ -441,10 +304,9 @@ inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, template <> inline int32x4x2_t convolve_3x3<3>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - int32x4x2_t out = 
convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 3), out.val[0], 1); return out; } @@ -477,34 +339,6 @@ inline void store_results<3>(float *buffer, const float32x4x2_t &values) vst1_f32(buffer, vget_low_f32(values.val[0])); } -/** Stores a qint16_t array into a memory location. - * - * @param[in] buffer Pointer to the memory location where the values will be stored. - * @param[in] values Values that will be stored. - * - */ -template -void store_results(qint16_t *buffer, const qint16x8x2_t &values); - -template <> -inline void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1q_qs16(buffer, values.val[0]); - vst1q_qs16(buffer + 8, values.val[1]); -} - -template <> -inline void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1q_qs16(buffer, values.val[0]); -} - -template <> -inline void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1_qs16(buffer, vget_low_s16(values.val[0])); -} - /** Stores a uint32_t array into a memory location. * * @param[in] buffer Pointer to the memory location where the values will be stored. @@ -557,25 +391,20 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr) /** Perform a convolve3x3 on float16. * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. 
* */ template -float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position); +float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2); template <> -inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const float16x8x3_t vtop = { { @@ -627,10 +456,9 @@ inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *i } template <> -inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3); @@ -638,10 +466,9 @@ inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *i } template <> -inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); return out; } -- cgit v1.2.1
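A minimal sketch of how the simplified F32 helpers from NEDirectConvolutionDetail.h chain together after this change, for one stride-1 iteration. It assumes the helpers live in the arm_compute::detail namespace and that the 3x3 weights are stored row-major in a contiguous array; the function name convolve_row_f32 is illustrative only and not part of the library. The only optional argument the helpers keep is input_offset, which the quantized (QASYMM8) path still needs; fixed_point_position is gone everywhere.

#include <arm_neon.h>

#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"

void convolve_row_f32(const float *in_top, const float *in_mid, const float *in_low,
                      const float *weights, float *out)
{
    using namespace arm_compute::detail; // assumed namespace

    // load_matrix_row() duplicates each weight of a filter row across all lanes:
    // m.val[0]/[1]/[2] hold the first/second/third weight of that row.
    const float32x4x3_t m0 = load_matrix_row(weights + 0); // filter row 0 (row-major layout assumed)
    const float32x4x3_t m1 = load_matrix_row(weights + 3); // filter row 1
    const float32x4x3_t m2 = load_matrix_row(weights + 6); // filter row 2

    // Stride-1 3x3 convolution producing 8 consecutive output accumulators.
    // There is no fixed_point_position argument any more; the trailing
    // input_offset is only used by the quantized overloads and the F32
    // specialisation ignores it, so 0 is passed here.
    const float32x4x2_t acc = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, 0);

    // For stride 1 all 8 accumulated results are written back.
    store_results<1>(out, acc);
}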