From f9b595adbdc3f6f51ffa2c1f2aa70d0262d0db2d Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Fri, 3 Jul 2020 13:34:52 +0100 Subject: COMPMID-3532: Align data type support between doxygen and implementation - NEON Change-Id: I70662cfb43890873b706b3f22b348f5d8cdd63ca Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3506 Tested-by: Arm Jenkins Reviewed-by: Manuel Bottini Reviewed-by: Sheri Zhang Comments-Addressed: Arm Jenkins --- .../NEON/kernels/NEArithmeticSubtractionKernel.h | 12 +++++ .../core/NEON/kernels/NEChannelExtractKernel.h | 6 +-- .../NEON/kernels/NEDequantizationLayerKernel.h | 6 +-- .../NEON/kernels/NEElementwiseOperationKernel.h | 38 +++++++-------- .../core/NEON/kernels/NEElementwiseUnaryKernel.h | 4 +- .../core/NEON/kernels/NEGEMMLowpReductionKernel.h | 10 ++-- .../core/NEON/kernels/NEGaussian3x3Kernel.h | 4 +- arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 4 +- .../core/NEON/kernels/NEMaxUnpoolingLayerKernel.h | 11 ++--- .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 56 +++++++++++++--------- .../core/NEON/kernels/NEReductionOperationKernel.h | 4 +- .../core/NEON/kernels/NEUpsampleLayerKernel.h | 4 +- .../core/NEON/kernels/NEWeightsReshapeKernel.h | 4 +- .../NEON/functions/NEArithmeticSubtraction.h | 24 ++++++++++ arm_compute/runtime/NEON/functions/NECast.h | 18 +++---- .../runtime/NEON/functions/NEDequantizationLayer.h | 6 +-- .../NEON/functions/NEDetectionPostProcessLayer.h | 24 +++++----- .../NEON/functions/NEElementwiseOperations.h | 4 +- .../NEON/functions/NEElementwiseUnaryLayer.h | 4 +- .../NEON/functions/NEGEMMConvolutionLayer.h | 16 +++---- arm_compute/runtime/NEON/functions/NEIm2Col.h | 4 +- .../NEON/functions/NEPixelWiseMultiplication.h | 56 +++++++++++++--------- .../runtime/NEON/functions/NEPoolingLayer.h | 6 +-- .../runtime/NEON/functions/NEReductionOperation.h | 13 +++-- arm_compute/runtime/NEON/functions/NEScale.h | 8 ++-- arm_compute/runtime/NEON/functions/NEUnstack.h | 10 ++-- .../NEON/kernels/NEAbsoluteDifferenceKernel.cpp | 11 ++--- src/core/NEON/kernels/NEActivationLayerKernel.cpp | 12 ++--- .../NEON/kernels/NEBatchConcatenateLayerKernel.cpp | 4 -- .../NEON/kernels/NEBatchToSpaceLayerKernel.cpp | 3 -- .../NEDepthwiseConvolutionLayerNativeKernel.cpp | 4 +- src/core/NEON/kernels/NEDilateKernel.cpp | 14 +++--- src/core/NEON/kernels/NEErodeKernel.cpp | 14 +++--- src/core/NEON/kernels/NEFlattenLayerKernel.cpp | 11 ++--- .../kernels/NEFuseBatchNormalizationKernel.cpp | 1 - .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 11 +---- src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 15 ++---- .../NEON/kernels/NEMaxUnpoolingLayerKernel.cpp | 33 +++++-------- .../kernels/NEPixelWiseMultiplicationKernel.cpp | 11 +++-- src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp | 2 +- src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp | 7 +-- src/core/NEON/kernels/NEReverseKernel.cpp | 34 +++---------- src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 15 ++---- src/runtime/NEON/functions/NERNNLayer.cpp | 1 + 44 files changed, 269 insertions(+), 290 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h index dfd08d9b06..4872edd90f 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -73,6 +73,18 @@ public: */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy); /** 
Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 * * @note Convert policy cannot be WRAP if datatype is QASYMM8 * diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h index 8d62016fe5..d953ff33ed 100644 --- a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h +++ b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -27,8 +27,6 @@ #include "arm_compute/core/NEON/INESimpleKernel.h" #include "arm_compute/core/Types.h" -#include - namespace arm_compute { class IMultiImage; @@ -60,7 +58,7 @@ public: * * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422 * @param[in] channel Channel to extract. - * @param[out] output Destination tensor. Format supported: u8 + * @param[out] output Destination tensor. Format supported: U8 */ void configure(const ITensor *input, Channel channel, ITensor *output); /** Set the input and output of the kernel diff --git a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h index 3792fb3bd7..2a85da28de 100644 --- a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -52,13 +52,13 @@ public: ~NEDequantizationLayerKernel() = default; /** Set input, output tensors. * - * @param[in] input Source tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayerKernel * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[in] output Output tensor info. Data types supported: F16/F32. * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h index b109ddd0f8..12d7fb50a9 100644 --- a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h +++ b/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h @@ -59,10 +59,10 @@ public: /** Common signature for all the specialised arithmetic functions * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Dependent on subclass. - * @param[in] window Region on which to execute the kernel. 
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: Dependent on subclass. + * @param[in] window Region on which to execute the kernel. */ using ElementwiseFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); @@ -100,10 +100,10 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel * - * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] op Arithmetic operation to be executed. + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: Same as @p input1. */ void configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); @@ -131,9 +131,9 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel * - * @param[in] input1 First tensor input info. Data types supported: F16/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: Same as @p input1. */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); @@ -168,9 +168,9 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel * - * @param[in] input1 First tensor input info. Data types supported: F16/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[out] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * * @return a Status */ @@ -189,10 +189,10 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel * - * @param[in] op Comparison operation to be executed. - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: U16/U32. + * @param[in] op Comparison operation to be executed. + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. 
Data types supported: U8. */ void configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); @@ -201,7 +201,7 @@ public: * @param[in] op Comparison operation to be executed. * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: U16/U32. + * @param[in] output Output tensor info. Data types supported: U8. * * @return a Status */ diff --git a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h index 02c390b6ba..c63618c256 100644 --- a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h +++ b/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h @@ -60,7 +60,7 @@ public: /** Function to configure the @ref NEElementwiseUnaryKernel * * @param[in] op Arithmetic operation to be executed. - * @param[in] input First tensor input. Data types supported: F16/F32. + * @param[in] input First tensor input. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. * @param[out] output Output tensor. Data types supported: Same as @p input. */ void configure(ElementWiseUnary op, const ITensor *input, ITensor *output); @@ -68,7 +68,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEElementwiseUnaryKernel * * @param[in] op Arithmetic operation to be executed. - * @param[in] input First tensor input info. Data types supported: F16/F32. + * @param[in] input First tensor input info. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. * @param[in] output Output tensor info. Data types supported: Same as @p input. * * @return a Status diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h index 1e472f5252..dcee3da2d5 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -49,7 +49,7 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 * @param[in] info Kernel metadata: * - k Number of matrix columns/rows depending on the type of reduction. @@ -82,7 +82,7 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 * @param[in] info Kernel metadata: * - k (num_mtx_a_cols) Number of matrix A columns @@ -93,7 +93,7 @@ public: void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] mtx_a Input tensor. 
Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
 * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
 * @param[in] info Kernel metadata:
 * - k (num_mtx_a_cols) Number of matrix A columns
@@ -131,7 +131,7 @@ public:
 }
 /** Initialise the kernel's input and output.
 *
- * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
 * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
 * @param[in] info Kernel metadata:
 * - k (num_mtx_b_rows) Number of matrix B rows.
 void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
 /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
 *
- * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
 * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
 * @param[in] info Kernel metadata:
 * - k (num_mtx_b_rows) Number of matrix B rows.
diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
index fa92eef1b7..099b226d2f 100644
--- a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -41,7 +41,7 @@ public:
 /** Set the source, destination and border mode of the kernel
 *
 * @param[in] input Source tensor. Data type supported: U8
- * @param[out] output Destination tensor. Data type supported: S16
+ * @param[out] output Destination tensor. Data type supported: same as @p input
 * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
 */
 void configure(const ITensor *input, ITensor *output, bool border_undefined);
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index 1c358b379d..97cdfb4958 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -79,7 +79,7 @@ public:
 * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
 * while every optional dimension from 4 and above represent a batch of inputs.
 * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8 works only for has_bias = false
+ * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
 * @param[out] output The output tensor. Data types supported: Same as @p input
 * @param[in] kernel_dims The kernel dimensions (width and height).
 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -94,7 +94,7 @@ public:
 * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
 * while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32 - * Note: QASYMM8 works only for has_bias = false + * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false * @param[in] output The output tensor. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. diff --git a/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h index 269317b6c1..7160d5d328 100644 --- a/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h @@ -54,7 +54,7 @@ public: * * @note Output shape must be equal to the shape of the original input to pool. * - * @param[in] input Source tensor. Data types supported: F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] indices The indices of the maximal values. Data type supported: U32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. @@ -62,9 +62,9 @@ public: void configure(const ITensor *input, const ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref NEMaxUnpoolingLayerKernel * - * @param[in] input Source tensor. Data types supported: F16/F32. - * @param[in] output Destination tensor. Data types supported: Same as @p input. - * @param[in] indices The indices of the maximal values. Data type supported: U32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] output Destination tensor info. Data types supported: Same as @p input. + * @param[in] indices Tensor info of the indices of the maximal values. Data type supported: U32. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * * @return a status @@ -89,9 +89,6 @@ private: const ITensor *_input; ITensor *_output; const ITensor *_indices; - PoolingLayerInfo _pool_info; - DataLayout _data_layout; - unsigned int _num_elems_processed_per_iteration; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H */ diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h index 5483fae565..2263e480a9 100644 --- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h @@ -52,21 +52,27 @@ public: /** Default destructor */ ~NEPixelWiseMultiplicationKernel() = default; /** Initialise the kernel's input, output and border mode. + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,U8) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * - (QSYMM16,QSYMM16) -> S32 * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in] input1 An input tensor. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[out] output Output tensor. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are QSYMM16. - * - F16, only if @p input1 is F16. - * - F32, only if both inputs are F32. + * @param[in] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * @param[in] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16. @@ -74,21 +80,27 @@ public: */ void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,U8) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * - (QSYMM16,QSYMM16) -> S32 * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[in] output Output tensor info. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are QSYMM16. - * - F16, only if @p input1 is F16. - * - F32, only if both inputs are F32. + * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. 
ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16. diff --git a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h b/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h index 28cca4987b..523c812f7d 100644 --- a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h +++ b/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h @@ -59,7 +59,7 @@ public: /** Set the source, destination of the kernel * - * @param[in] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW. + * @param[in] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input, S32 for ARG_MIX/ARG_MAX. * Output will have the same number of dimensions as input. * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 @@ -69,7 +69,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperationKernel. * - * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW. + * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p input, S32 for ARG_MIX/ARG_MAX. * Output will have the same number of dimensions as input. * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 diff --git a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h b/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h index 9038eda9b2..5bd702aef6 100644 --- a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h @@ -52,7 +52,7 @@ public: ~NEUpsampleLayerKernel() = default; /** Set the input output tensors. * - * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destination tensor. Data types supported: same as @p input. * @param[in] info Contains stride information described in @ref Size2D. * @param[in] policy Defines the policy to fill the intermediate pixels. @@ -61,7 +61,7 @@ public: void configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEUpsampleLayerKernel * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] output Destination tensor info. Data types supported: same as @p input. * @param[in] info Contains stride information described in @ref Size2D. * @param[in] policy Defines the policy to fill the intermediate pixels. diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h index b68cb50c7b..c6e4053293 100644 --- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h +++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h @@ -76,7 +76,7 @@ public: * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. 
- * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/FP16/F32 + * Data types supported: All * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. @@ -87,7 +87,7 @@ public: * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. - * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32 + * Data types supported: All * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index 0bab911c1a..90a33d9c2f 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -46,6 +46,18 @@ class NEArithmeticSubtraction : public INEOperator { public: /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 @@ -55,6 +67,18 @@ public: */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 * * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 * @param[in] input2 Second tensor input. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h index 55c21a01ec..a124452c72 100644 --- a/arm_compute/runtime/NEON/functions/NECast.h +++ b/arm_compute/runtime/NEON/functions/NECast.h @@ -44,23 +44,23 @@ public: * Valid conversions Input -> Output : * * - QASYMM8_SIGNED -> S16, S32, F32, F16 - * - QASYMM8 -> U16, S16, S32, F32, F16 - * - U8 -> U16, S16, S32, F32, F16 - * - U16 -> U8, U32 - * - S16 -> QASYMM8_SIGNED, U8, S32 - * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 - * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 - * - F32 -> QASYMM8_SIGNED, QASYMM8, F16, S32, U8 + * - QASYMM8 -> U16, S16, S32, F32, F16 + * - U8 -> U16, S16, S32, F32, F16 + * - U16 -> U8, U32 + * - S16 -> QASYMM8_SIGNED, U8, S32 + * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 + * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 + * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 * * @param[in] input The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32. - * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/F16/F32. + * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/BFLOAT16/F16/F32. * @param[in] policy Conversion policy. */ void configure(ITensor *input, ITensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NECast * * @param[in] input Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32. - * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/BFLOAT16/F16/F32. * @param[in] policy Conversion policy. * * @return a status diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h index f8d0ce8b2d..4ac8e72fda 100644 --- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,13 +39,13 @@ class NEDequantizationLayer : public INESimpleFunctionNoBorder public: /** Configure the kernel. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayer * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[in] output Output tensor info. Data type supported: F16/F32. 
* * @return a status diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h index d616762a5a..10dddc7afb 100644 --- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,9 +55,9 @@ public: NEDetectionPostProcessLayer &operator=(const NEDetectionPostProcessLayer &) = delete; /** Configure the detection output layer NE function * - * @param[in] input_box_encoding The bounding box input tensor. Data types supported: F32, QASYMM8. - * @param[in] input_score The class prediction input tensor. Data types supported: Same as @p input_box_encoding. - * @param[in] input_anchors The anchors input tensor. Data types supported: Same as @p input_box_encoding. + * @param[in] input_box_encoding The bounding box input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32. + * @param[in] input_score The class prediction input tensor. Data types supported: same as @p input_box_encoding. + * @param[in] input_anchors The anchors input tensor. Data types supported: same as @p input_box_encoding. * @param[out] output_boxes The boxes output tensor. Data types supported: F32. * @param[out] output_classes The classes output tensor. Data types supported: Same as @p output_boxes. * @param[out] output_scores The scores output tensor. Data types supported: Same as @p output_boxes. @@ -70,14 +70,14 @@ public: ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer * - * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: F32, QASYMM8. - * @param[in] input_class_score The class prediction input tensor info. Data types supported: F32, QASYMM8. - * @param[in] input_anchors The anchors input tensor. Data types supported: F32, QASYMM8. - * @param[out] output_boxes The output tensor. Data types supported: F32. - * @param[out] output_classes The output tensor. Data types supported: Same as @p output_boxes. - * @param[out] output_scores The output tensor. Data types supported: Same as @p output_boxes. - * @param[out] num_detection The number of output detection. Data types supported: Same as @p output_boxes. - * @param[in] info (Optional) DetectionPostProcessLayerInfo information. + * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32. + * @param[in] input_class_score The class prediction input tensor info. Data types supported: same as @p input_box_encoding. + * @param[in] input_anchors The anchors input tensor info. Data types supported: same as @p input_box_encoding. + * @param[in] output_boxes The output tensor info. Data types supported: F32. + * @param[in] output_classes The output tensor info. Data types supported: Same as @p output_boxes. + * @param[in] output_scores The output tensor info. Data types supported: Same as @p output_boxes. + * @param[in] num_detection The number of output detection tensor info. Data types supported: Same as @p output_boxes. + * @param[in] info (Optional) DetectionPostProcessLayerInfo information. 
* * @return a status */ diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h index 08f798ec6e..d24f975052 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h @@ -292,7 +292,7 @@ public: * * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[out] output Output tensor. Data types supported: U16/U32. + * @param[out] output Output tensor. Data types supported: U8. * @param[in] op Comparison Operation to be performed. */ void configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op); @@ -300,7 +300,7 @@ public: * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: U16/U32. + * @param[in] output Output tensor info. Data types supported: U8. * @param[in] op Comparison Operation to be performed. * * @return a status diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h index 1fd24887a5..762329d833 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h @@ -96,13 +96,13 @@ class NELogLayer : public INESimpleFunctionNoBorder public: /** Initialize the function * - * @param[in] input Input tensor. Data types supported: F16/F32/S32. + * @param[in] input Input tensor. Data types supported: F16/F32. * @param[out] output Output tensor. Data types supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NELogLayer * - * @param[in] input First tensor input info. Data types supported: F16/F32/S32. + * @param[in] input First tensor input info. Data types supported: F16/F32. * @param[in] output Output tensor info. Data types supported: Same as @p input. * * @return a status diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index e7da1006e0..8bff3ba431 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -64,21 +64,21 @@ public: /** Set the input and output tensors. * * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * Data type supported: All. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16 - * @param[out] output Destination tensor. - * Data types supported: Same as @p weights, FP32 if @p weights is BFLOAT16 + * Data type supported: same as @p weights. + * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. + * @param[out] output Destination tensor. Data types supported: same as @p weights. 
*/
 void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
 /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
 *
 * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * Data type supported: All.
 * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16
- * @param[in] output Destination tensor.
- * Data types supported: Same as @p weights FP32 if @p weights is BFLOAT16
+ * Data type supported: same as @p weights.
+ * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
+ * @param[in] output Destination tensor. Data types supported: same as @p weights.
 *
 * @return an error status
 */
diff --git a/arm_compute/runtime/NEON/functions/NEIm2Col.h b/arm_compute/runtime/NEON/functions/NEIm2Col.h
index cb905a3652..79abcdb1e2 100644
--- a/arm_compute/runtime/NEON/functions/NEIm2Col.h
+++ b/arm_compute/runtime/NEON/functions/NEIm2Col.h
@@ -46,7 +46,7 @@ public:
 * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
 * while every optional dimension from 4 and above represent a batch of inputs.
 * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8 works only for has_bias = false
+ * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
 * @param[out] output The output tensor. Data types supported: Same as @p input
 * @param[in] kernel_dims The kernel dimensions (width and height).
 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -61,7 +61,7 @@ public:
 * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
 * while every optional dimension from 4 and above represent a batch of inputs.
 * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8 works only for has_bias = false
+ * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
 * @param[in] output The output tensor. Data types supported: Same as @p input
 * @param[in] kernel_dims The kernel dimensions (width and height).
 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 3b1209356a..4b32f964fe 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -39,23 +39,29 @@ class NEPixelWiseMultiplication : public INEOperator
 {
 public:
 /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * Valid configurations (Input1,Input2) -> Output :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,U8) -> S16
+ * - (S16,S16) -> S16
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ * - (QSYMM16,QSYMM16) -> S32
 *
 * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * @param[in, out] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in, out] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output Output tensor. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are QSYMM16. - * - F16, only if @p input1 is F16. - * - F32, only if both inputs are F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16. @@ -65,21 +71,27 @@ public: void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,U8) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * - (QSYMM16,QSYMM16) -> S32 * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[in] output Output tensor info. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are QSYMM16. - * - F16, only if @p input1 is F16. - * - F32, only if both inputs are F32. + * @param[in] input1 First input tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16. diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index e43741c95b..8d018b77a4 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -58,10 +58,10 @@ public: * * @note F16 is supported for pool sizes 2 and 3 only * - * @param[in] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] output Destination tensor. Data types supported: Same as @p input. + * @param[in] input Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32. + * @param[in] indices (optional) Tensor info of the indices of the maximal values. Data type supported: U32. * * @return a status */ diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h index 78e8b04dbb..a7c6e238c4 100644 --- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h +++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h @@ -28,7 +28,6 @@ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h" -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -49,17 +48,17 @@ public: NEReductionOperation(std::shared_ptr memory_manager = nullptr); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0) - * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. - * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 - * @param[in] op Reduction operation to perform. - * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true. + * @param[in, out] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. (Written to only for border_size != 0) + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. + * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 + * @param[in] op Reduction operation to perform. + * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true. 
*/ void configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperation. * - * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0) + * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input. * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 * @param[in] op Reduction operation to perform. diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h index 30bcdc168b..901001fd76 100644 --- a/arm_compute/runtime/NEON/functions/NEScale.h +++ b/arm_compute/runtime/NEON/functions/NEScale.h @@ -47,7 +47,7 @@ public: NEScale(); /** Initialize the function's source, destination, interpolation type and border_mode. * - * @param[in, out] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) * @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] policy The interpolation type. * @param[in] border_mode Strategy to use for borders. @@ -61,14 +61,14 @@ public: SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false); /** Initialize the function's source, destination, interpolation type and border_mode. * - * @param[in, out] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) * @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo to be used for configuration */ void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEScale * - * @param[in] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) * @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] policy The interpolation type. * @param[in] border_mode Strategy to use for borders. 
@@ -84,7 +84,7 @@ public: PixelValue constant_border_value = PixelValue(), SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false); /** Static function to check if given info will lead to a valid configuration of @ref NEScale * - * @param[in] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) * @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo to be used for validation * diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h index dbb04f08e5..6f8560b2bd 100644 --- a/arm_compute/runtime/NEON/functions/NEUnstack.h +++ b/arm_compute/runtime/NEON/functions/NEUnstack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -47,8 +47,8 @@ public: NEUnstack(); /** Set the input, output and unstacking axis. * - * @param[in] input A tensor to be unstacked. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. - * @param[in,out] output_vector A vector of tensors. Data types supported: Same as @p input. + * @param[in] input A tensor to be unstacked. Data type supported: All. + * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input. * Note: The number of elements of the vector will be used as the number of slices to be taken from the axis. * @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around. * @@ -56,8 +56,8 @@ public: void configure(const ITensor *input, const std::vector &output_vector, int axis); /** Static function to check if given info will lead to a valid configuration of @ref NEUnstack * - * @param[in] input Input tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[in] output_vector Vector of output tensors' info. Data types supported: Same as @p input. + * @param[in] input Input tensor info. Data type supported: All. + * @param[in] output_vector Vector of output tensors' info. Data types supported: same as @p input. * @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around. * * @return a status diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp index 62285e0578..28f30717ab 100644 --- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp +++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,15 +32,9 @@ #include "arm_compute/core/Validate.h" #include -#include - -using namespace arm_compute; namespace arm_compute { -class Coordinates; -} // namespace arm_compute - namespace { void abs_diff_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -140,7 +134,7 @@ void NEAbsoluteDifferenceKernel::configure(const ITensor *input1, const ITensor { set_format_if_unknown(*output->info(), Format::S16); } - else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) + else if(input1->info()->data_type() == DataType::U8 || input2->info()->data_type() == DataType::U8) { set_format_if_unknown(*output->info(), Format::U8); } @@ -210,3 +204,4 @@ void NEAbsoluteDifferenceKernel::run(const Window &window, const ThreadInfo &inf _func(_input1, _input2, _output, window); } +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index 43426dc122..1db9ff9e3f 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -27,28 +27,23 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/NEON/NESymm.h" #include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include -#include -#include -#include #include -using namespace arm_compute; +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); const static std::set qasymm8_supported_activations = { @@ -874,3 +869,4 @@ void NEActivationLayerKernel::run_op(const InputTensorMap &inputs, (this->*_func)(inputs.at(TensorType::ACL_SRC), outputs.at(TensorType::ACL_DST), window); } +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp index 5650b810d2..a5a574de63 100644 --- a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp @@ -26,17 +26,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include - namespace arm_compute { namespace diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp index c4c0f01f0f..4ad3dd76f9 100644 --- 
a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp @@ -25,12 +25,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include -#include using namespace arm_compute::misc::shape_calculator; diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp index ef196ab904..a639a926ec 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp @@ -427,12 +427,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, if(is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } if(biases != nullptr) @@ -454,6 +453,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, { const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } return Status{}; diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp index e761815f9e..b4cc699c8f 100644 --- a/src/core/NEON/kernels/NEDilateKernel.cpp +++ b/src/core/NEON/kernels/NEDilateKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,16 +30,9 @@ #include "arm_compute/core/Validate.h" #include -#include -#include - -using namespace arm_compute; namespace arm_compute { -class Coordinates; -} // namespace arm_compute - BorderSize NEDilateKernel::border_size() const { return BorderSize(1); @@ -47,6 +40,10 @@ BorderSize NEDilateKernel::border_size() const void NEDilateKernel::configure(const ITensor *input, ITensor *output, bool border_undefined) { + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + _input = input; _output = output; @@ -126,3 +123,4 @@ void NEDilateKernel::run(const Window &window, const ThreadInfo &info) }, in, out); } +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp index 2a538ecd0f..edfcbb50c4 100644 --- a/src/core/NEON/kernels/NEErodeKernel.cpp +++ b/src/core/NEON/kernels/NEErodeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,16 +30,9 @@ #include "arm_compute/core/Validate.h" #include -#include -#include - -using namespace arm_compute; namespace arm_compute { -class Coordinates; -} // namespace arm_compute - BorderSize NEErodeKernel::border_size() const { return BorderSize(1); @@ -47,6 +40,10 @@ BorderSize NEErodeKernel::border_size() const void NEErodeKernel::configure(const ITensor *input, ITensor *output, bool border_undefined) { + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + _input = input; _output = output; @@ -126,3 +123,4 @@ void NEErodeKernel::run(const Window &window, const ThreadInfo &info) }, in, out); } +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp index a48601f7b0..9dbf245c7a 100644 --- a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp +++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -33,19 +33,17 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include - -using namespace arm_compute; +namespace arm_compute +{ using namespace misc::shape_calculator; namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); // Checks performed when output is configured if(output->total_size() != 0) @@ -135,3 +133,4 @@ void NEFlattenLayerKernel::run(const Window &window, const ThreadInfo &info) } while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice)); } +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp index 6e7e5ab23f..b71630dba3 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp @@ -32,7 +32,6 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "utils/TypePrinter.h" #include namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index 1acdb1efce..a8a976cd6b 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -24,19 +24,10 @@ #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" #include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include -#include namespace arm_compute { @@ -45,7 +36,7 @@ namespace Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo 
*output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); if(output->total_size() > 0) { diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 88104f7297..9080051e93 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,23 +24,16 @@ #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include -#include -#include - -using namespace arm_compute; +namespace arm_compute +{ namespace { TensorShape get_output_shape(const ITensorInfo *input) @@ -57,7 +50,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. 
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); if(output->total_size() != 0) { @@ -192,3 +184,4 @@ void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info) } } } +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp index 1967c553bd..1f65e3260b 100644 --- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp @@ -23,16 +23,8 @@ */ #include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" @@ -48,6 +40,10 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); + int pool_stride_x = 0; int pool_stride_y = 0; PoolingType pool_type = pool_info.pool_type; @@ -56,10 +52,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const int pool_size_x = pool_info.pool_size.width; const int pool_size_y = pool_info.pool_size.height; const Size2D pool_size(pool_size_x, pool_size_y); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); if(output->total_size() != 0) { @@ -72,20 +66,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } // namespace NEMaxUnpoolingLayerKernel::NEMaxUnpoolingLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0) + : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr) { } void NEMaxUnpoolingLayerKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const Size2D pool_size(pool_info.pool_size.width, pool_info.pool_size.height); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info())); - _input = input; - _output = output; - _indices = indices; - _pool_info = pool_info; - _data_layout = input->info()->data_layout(); + + _input = input; + _output = output; + _indices = indices; + switch(input->info()->data_type()) { case DataType::F32: @@ -107,8 
+100,8 @@ void NEMaxUnpoolingLayerKernel::configure(const ITensor *input, const ITensor *i } const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - _num_elems_processed_per_iteration = 1; - auto window = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration)); + + auto window = calculate_max_window(*input->info(), Steps()); INEKernel::configure(window); } template <typename T> diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp index cd1c4b28cc..4b2352f4c2 100644 --- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -62,17 +62,18 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i if(output->total_size() > 0) { - if(is_data_type_quantized(output->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); - } - const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8), + "Output can only be QASYMM8 if both inputs are QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED), + "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16), + "Output can only be QSYMM16 if both inputs are QSYMM16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16), "Output can only be S32 if both inputs are QSYMM16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output"); diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp index d830d0db67..a1180d5e61 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp @@ -29,7 +29,6 @@ #include "arm_compute/core/Validate.h" #include -#include namespace arm_compute { @@ -68,6 +67,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, if(output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); } return Status{}; diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp index 00c3f98334..5cf2bd288c 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp +++
b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp @@ -23,18 +23,13 @@ */ #include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "support/ToolchainSupport.h" #include -#include namespace arm_compute { @@ -53,7 +48,7 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5); ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2); ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); if(output->info()->total_size() != 0) diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp index 5a8c446ddd..8c3c59559f 100644 --- a/src/core/NEON/kernels/NEReverseKernel.cpp +++ b/src/core/NEON/kernels/NEReverseKernel.cpp @@ -23,24 +23,11 @@ */ #include "arm_compute/core/NEON/kernels/NEReverseKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include -#include -#include -#include - namespace arm_compute { namespace @@ -48,7 +35,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. 
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); @@ -159,28 +146,19 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_input->info()->data_type()) + switch(_input->info()->element_size()) { - case DataType::F32: - case DataType::U32: - case DataType::S32: + case 4: run_reverse<uint32_t>(window, _input, _axis, _output); break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::S16: - case DataType::U16: + case 2: run_reverse<uint16_t>(window, _input, _axis, _output); break; - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::U8: - case DataType::S8: + case 1: run_reverse<uint8_t>(window, _input, _axis, _output); break; default: - ARM_COMPUTE_ERROR("Data type not supported"); + ARM_COMPUTE_ERROR("Element size not supported"); } } } // namespace arm_compute \ No newline at end of file diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp index d376d53081..f271f57f19 100644 --- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp @@ -23,15 +23,11 @@ */ #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" -#include "arm_compute/core/Dimensions.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -using namespace arm_compute; - +namespace arm_compute +{ namespace { TensorShape get_output_shape(const ITensorInfo *input, bool has_bias) @@ -48,11 +44,9 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, - DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); if(biases != nullptr) { @@ -179,3 +173,4 @@ void NEWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info) }, in); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index 19b84e7fb8..5385192f16 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -43,6 +43,7 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights const ITensorInfo *output, const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); const int idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); -- cgit v1.2.1
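Note for reviewers: the NEPixelWiseMultiplicationKernel hunk replaces a blanket mismatching-types check with per-type rules. A minimal standalone sketch of the constraint those new ARM_COMPUTE_RETURN_ERROR_ON_MSG lines encode follows; this is hypothetical illustration code, not ComputeLibrary code, and the name mul_output_type_consistent is invented. The rule: each quantized (or U8) output type is valid only when both inputs already carry exactly that type, and S32 output is reserved for the QSYMM16 * QSYMM16 path.

#include <initializer_list>

enum class DataType
{
    U8, S16, F16, F32, S32, QASYMM8, QASYMM8_SIGNED, QSYMM16
};

bool mul_output_type_consistent(DataType in1, DataType in2, DataType out)
{
    // U8 and each quantized output type require both inputs to match it exactly.
    for(DataType dt : { DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16 })
    {
        if(out == dt && (in1 != dt || in2 != dt))
        {
            return false;
        }
    }
    // S32 output is only produced by the QSYMM16 * QSYMM16 path.
    if(out == DataType::S32 && (in1 != DataType::QSYMM16 || in2 != DataType::QSYMM16))
    {
        return false;
    }
    return true;
}

Under these assumed semantics, mul_output_type_consistent(QSYMM16, QSYMM16, S32) is the only combination that reaches an S32 output, which matches the retained "Output can only be S32 if both inputs are QSYMM16" check.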
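The NEReverseKernel hunk replaces per-data-type dispatch with per-element-size dispatch: reversal never interprets element values, so one template instantiation per element width (1, 2 or 4 bytes) covers every data type of that width, which is why run() and its error message now key on element_size() rather than data_type(). A hedged sketch of the pattern, with invented names (reverse_buffer, reverse_dispatch) standing in for the kernel's run_reverse paths:

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <utility>

// Reverse n elements of width sizeof(T); a toy stand-in for run_reverse<T>.
template <typename T>
void reverse_buffer(T *data, std::size_t n)
{
    for(std::size_t i = 0, j = n; i + 1 < j; ++i, --j)
    {
        std::swap(data[i], data[j - 1]);
    }
}

// Dispatch on element size alone: F32/U32/S32 share the 4-byte path,
// F16/S16/U16 the 2-byte path, QASYMM8/QASYMM8_SIGNED/U8/S8 the 1-byte path.
void reverse_dispatch(void *data, std::size_t n, std::size_t element_size)
{
    switch(element_size)
    {
        case 4:
            reverse_buffer(static_cast<std::uint32_t *>(data), n);
            break;
        case 2:
            reverse_buffer(static_cast<std::uint16_t *>(data), n);
            break;
        case 1:
            reverse_buffer(static_cast<std::uint8_t *>(data), n);
            break;
        default:
            throw std::runtime_error("Element size not supported");
    }
}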