From cbbed288a71f2f048123db3cf396361e5d66ce93 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Fri, 20 Dec 2019 13:26:08 +0000 Subject: COMPMID-2991: Add support for QASYMM8_SIGNED in CL kernels/functions - part 2 Adding support for QASYMM8_SIGNED to the following CL kernels/functions: - CLActivationLayerKernel/CLActivationLayer - CLComparisonKernel/CLComparison - CLConvertFullyConnectedWeightsKernel/CLConvertFullyConnectedWeights - CLDeconvolutionLayerUpsampleKernel/CLDeconvolutionLayerUpsample - CLDepthToSpaceLayerKernel/CLDepthToSpaceLayer - CLDequantizationLayerKernel/CLDequantizationLayer - CLGEMMMatrixVectorMultiplyKernel - CLNormalizePlanarYUVLayerKernel - CLPReluLayer - CLPixelWiseMultiplicationKernel/CLPixelWiseMultiplication - CLPoolingLayerKernel/CLPoolingLayer Change-Id: I874bbb7c2b08baa9c5ff4c9e6bc8778b42a6bec5 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2539 Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- .../core/CL/kernels/CLActivationLayerKernel.h | 6 +- arm_compute/core/CL/kernels/CLComparisonKernel.h | 6 +- .../kernels/CLConvertFullyConnectedWeightsKernel.h | 6 +- .../kernels/CLDeconvolutionLayerUpsampleKernel.h | 4 +- .../core/CL/kernels/CLDepthToSpaceLayerKernel.h | 6 +- .../core/CL/kernels/CLDequantizationLayerKernel.h | 6 +- .../CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h | 14 ++-- .../CL/kernels/CLNormalizePlanarYUVLayerKernel.h | 6 +- .../CL/kernels/CLPixelWiseMultiplicationKernel.h | 6 +- arm_compute/core/CL/kernels/CLPoolingLayerKernel.h | 6 +- .../runtime/CL/functions/CLActivationLayer.h | 6 +- arm_compute/runtime/CL/functions/CLComparison.h | 6 +- .../CL/functions/CLConvertFullyConnectedWeights.h | 8 +- .../CL/functions/CLDeconvolutionLayerUpsample.h | 6 +- .../runtime/CL/functions/CLDepthToSpaceLayer.h | 20 ++--- .../runtime/CL/functions/CLDequantizationLayer.h | 6 +- .../CL/functions/CLNormalizePlanarYUVLayer.h | 22 ++--- 
arm_compute/runtime/CL/functions/CLPReluLayer.h | 6 +- .../CL/functions/CLPixelWiseMultiplication.h | 6 +- arm_compute/runtime/CL/functions/CLPoolingLayer.h | 6 +- src/core/CL/cl_kernels/comparisons.cl | 11 +-- src/core/CL/cl_kernels/convert_fc_weights.cl | 4 +- src/core/CL/cl_kernels/deconvolution_layer.cl | 4 +- src/core/CL/cl_kernels/depth_to_space.cl | 6 +- src/core/CL/cl_kernels/dequantization_layer.cl | 4 +- src/core/CL/cl_kernels/gemv.cl | 24 +++--- .../normalize_planar_yuv_layer_quantized.cl | 6 +- src/core/CL/cl_kernels/pixelwise_mul_int.cl | 8 +- src/core/CL/cl_kernels/pooling_layer.cl | 18 +--- src/core/CL/cl_kernels/pooling_layer_quantized.cl | 67 ++++++++------- src/core/CL/kernels/CLActivationLayerKernel.cpp | 95 +++++++++++++--------- src/core/CL/kernels/CLComparisonKernel.cpp | 11 +-- .../CLConvertFullyConnectedWeightsKernel.cpp | 13 ++- .../kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 9 +- src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp | 5 +- .../CL/kernels/CLDequantizationLayerKernel.cpp | 4 +- .../kernels/CLGEMMMatrixVectorMultiplyKernel.cpp | 6 +- .../CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp | 17 ++-- .../CL/kernels/CLPixelWiseMultiplicationKernel.cpp | 29 +++++-- src/core/CL/kernels/CLPoolingLayerKernel.cpp | 39 +++++---- .../functions/CLConvertFullyConnectedWeights.cpp | 9 +- src/runtime/CL/functions/CLDepthToSpaceLayer.cpp | 27 ++---- .../CL/functions/CLNormalizePlanarYUVLayer.cpp | 25 ++---- src/runtime/CL/functions/CLPoolingLayer.cpp | 26 ++++-- tests/datasets/DatatypeDataset.h | 3 +- tests/validation/CL/ActivationLayer.cpp | 14 ++-- tests/validation/CL/NormalizePlanarYUVLayer.cpp | 28 +++++-- tests/validation/CL/PReluLayer.cpp | 24 +++++- tests/validation/CL/PixelWiseMultiplication.cpp | 12 ++- tests/validation/CL/PoolingLayer.cpp | 22 +++-- .../reference/NormalizePlanarYUVLayer.cpp | 13 ++- tests/validation/reference/PoolingLayer.cpp | 11 ++- tests/validation/reference/UpsampleLayer.cpp | 33 +------- 53 files changed, 413 
insertions(+), 372 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h index cb2fda2be8..5b65a54824 100644 --- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -51,7 +51,7 @@ public: * @note If the output tensor is a nullptr, the activation function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] act_info Activation layer information. */ @@ -59,7 +59,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. * @param[in] output Destination tensor info. Data type supported: same as @p input * @param[in] act_info Activation layer information. * diff --git a/arm_compute/core/CL/kernels/CLComparisonKernel.h b/arm_compute/core/CL/kernels/CLComparisonKernel.h index 21c6aeb064..a9c463901d 100644 --- a/arm_compute/core/CL/kernels/CLComparisonKernel.h +++ b/arm_compute/core/CL/kernels/CLComparisonKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,7 +50,7 @@ public: ~CLComparisonKernel() = default; /** Set the inputs and output tensors * - * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input1 Source tensor. Data types supported: All. * @param[in] input2 Source tensor. Data types supported: Same as @p input1. * @param[out] output Destination tensor. Data types supported: U8. * @param[in] operation Comparison operation to use. @@ -58,7 +58,7 @@ public: void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel * - * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input1 Source tensor. Data types supported: All. * @param[in] input2 Source tensor. Data types supported: Same as @p input1. * @param[in] output Destination tensor. Data types supported: U8. * @param[in] operation Comparison operation to use. diff --git a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h index 6518dfc84c..b204eaa2ac 100644 --- a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h +++ b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,7 +55,7 @@ public: ~CLConvertFullyConnectedWeightsKernel() = default; /** Set the input and output tensor. * - * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All. * @param[out] output The converted weights tensor. 
Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. @@ -63,7 +63,7 @@ public: void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeightsKernel * - * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h index 8382f3b4d7..a1c6bbdafe 100644 --- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h +++ b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h @@ -50,14 +50,14 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor. Data types supported: All. * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. 
*/ void configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor info. Data types supported: All. * @param[in] output Destination tensor info. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. * diff --git a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h index 6ae991c8e7..637e5fa960 100644 --- a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -49,14 +49,14 @@ public: ~CLDepthToSpaceLayerKernel() = default; /** Initialise the kernel's inputs and output. * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel. * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. * @param[in] output Tensor output info. 
Data types supported: same as @p input * @param[in] block_shape Block shape value. * diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h index d79cd89883..78b5c14128 100644 --- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -48,13 +48,13 @@ public: ~CLDequantizationLayerKernel() = default; /** Set the input, output, min and max. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[out] output Destination tensor. Data types supported: F16/F32. */ void configure(const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[in] output Output tensor info. Data types supported: F16/F32. * * @return a status diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h index 821c6ae5bb..8ee911dc0e 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -46,16 +46,16 @@ public: CLGEMMMatrixVectorMultiplyKernel &operator=(CLGEMMMatrixVectorMultiplyKernel &&) = default; /** Set the input and output of the kernel. * - * @param[in] input0 The reshaped input tensor. Data types supported: QASYMM8/F16/F32 - * @param[in] input1 The 2D reshaped weights tensor. Data type supported: Same as @p input, S32 for QASYMM8 input. - * @param[out] output The output 2D tensor. Data types supported: Same as @p input + * @param[in] input0 The reshaped input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[in] input1 The 2D reshaped weights tensor. Data type supported: Same as @p input. + * @param[out] output The output 2D tensor. Data types supported: Same as @p input, S32 for QASYMM8/QASYMM8_SIGNED. */ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixVectorMultiplyKernel * - * @param[in] input0 The reshaped input tensor. Data types supported: QASYMM8/F16/F32 - * @param[in] input1 The 2D reshaped weights tensor. Data type supported: Same as @p input, S32 for QASYMM8 input. - * @param[in] output The output 2D tensor. Data types supported: Same as @p input + * @param[in] input0 The reshaped input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[in] input1 The 2D reshaped weights tensor info. Data type supported: Same as @p input. + * @param[in] output The output 2D tensor info. Data types supported: Same as @p input, S32 for QASYMM8/QASYMM8_SIGNED. 
* * @return a status */ diff --git a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h index d54aae31c5..4334882fd8 100644 --- a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,7 +50,7 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels]. - * Data types supported: QASYMM8/F16/F32. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels. @@ -60,7 +60,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel * * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels]. - * Data types supported: QASYMM8/F16/F32. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destination tensor info. Data type supported: same as @p input * @param[in] mean Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input * @param[in] std Standard deviation values tensor info. 1 dimension with size equal to the number of input channels. 
diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h index 0e5027b29a..58471ab299 100644 --- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -48,7 +48,7 @@ public: CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default; /** Initialise the kernel's input, output and border mode. * - * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. * @param[in] input2 An input tensor. Data types supported: same as @p input1. * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. @@ -60,7 +60,7 @@ public: ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel * - * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32. + * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. * @param[in] input2 An input tensor info. Data types supported: same as @p input1. * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. 
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h index 95acdf4b6c..4b3ee24333 100644 --- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -52,14 +52,14 @@ public: /** Set the input and output tensors. * * - * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h index f7781480bf..09f5d2bf58 100644 --- a/arm_compute/runtime/CL/functions/CLActivationLayer.h +++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -57,7 +57,7 @@ public: * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place * * @param[in, out] input Source tensor. 
In case of @p output tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] act_info Activation layer parameters. */ @@ -65,7 +65,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. * @param[in] output Destination tensor info. Data type supported: same as @p input * @param[in] act_info Activation layer information. * diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h index 7f0b22341f..85dbe7129d 100644 --- a/arm_compute/runtime/CL/functions/CLComparison.h +++ b/arm_compute/runtime/CL/functions/CLComparison.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -38,7 +38,7 @@ class CLComparison : public ICLSimpleFunction public: /** Initialise the kernel's inputs and outputs. * - * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input1 Source tensor. Data types supported: All. * The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. * @param[in] input2 Source tensor. Data types supported: Same as @p input1. * The input2 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. 
@@ -48,7 +48,7 @@ public: void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); /** Static function to check if given info will lead to a valid configuration of @ref CLComparison * - * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input1 Source tensor. Data types supported: All. * @param[in] input2 Source tensor. Data types supported: Same as @p input1. * @param[in] output Destination tensor. Data types supported: U8. * @param[out] operation Comparison operation to be used. diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h index f0359ecc75..76a28ed6fe 100644 --- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,7 +39,7 @@ class CLConvertFullyConnectedWeights : public ICLSimpleFunction public: /** Initialize the function. * - * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All. * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
@@ -49,7 +49,7 @@ public: void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeights * - * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. @@ -90,7 +90,7 @@ public: } /** Configures the @ref CLConvertFullyConnectedWeights function * - * @param[in] input Source weights tensor info to convert. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] input Source weights tensor info to convert. Data type supported: All. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h index 6f015744bf..5a1009c79f 100644 --- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h +++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -59,14 +59,14 @@ public: /** Initialize the function's source, destination, interpolation type and border_mode. * - * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[in, out] input Source tensor. 
Data type supported: All. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] info Contains padding and policies to be used in the deconvolution. */ void configure(ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] input Source tensor info. Data type supported: All. * @param[in] output Destination tensor info. Data type supported: same as @p input. * @param[in] info Contains padding and policies to be used in the deconvolution. * diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h index ddee04a3dd..0c33ed34be 100644 --- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,43 +24,33 @@ #ifndef ARM_COMPUTE_CLDEPTHTOSPACELAYER_H #define ARM_COMPUTE_CLDEPTHTOSPACELAYER_H -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { class ICLTensor; /** Basic function to run @ref CLDepthToSpaceLayerKernel. */ -class CLDepthToSpaceLayer : public IFunction +class CLDepthToSpaceLayer : public ICLSimpleFunction { public: - /** Default constructor */ - CLDepthToSpaceLayer(); /** Set the input and output tensors. * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[out] output Tensor output. 
Data types supported: same as @p input * @param[in] block_shape Block shape value. */ void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayer. * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. * @param[in] output Tensor output info. Data types supported: same as @p input * @param[in] block_shape Block shape value. * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); - - // Inherited methods overridden: - void run() override; - -private: - CLDepthToSpaceLayerKernel _depth_to_space_kernel; /**< CLDepthToSpaceLayerKernel to run */ }; } #endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h index 308349af02..48d6ba8435 100644 --- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,13 +40,13 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. - * Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32. 
*/ void configure(const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayer * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[in] output Output tensor info. Data type supported: F16/F32. * * @return a status diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h index 4fe5a111b6..5fbfdd18b7 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H #define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include namespace arm_compute { @@ -37,15 +37,13 @@ class ICLTensor; * * @note The function simulates a NormalizePlanarYUV layer. */ -class CLNormalizePlanarYUVLayer : public IFunction +class CLNormalizePlanarYUVLayer : public ICLSimpleFunction { public: - /** Default constructor */ - CLNormalizePlanarYUVLayer(); /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels]. - * Data types supported: QASYMM8/F16/F32. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destinationfeature tensor. Data type supported: same as @p input * @param[in] mean Mean values tensor. 
1 dimension with size equal to the number of input channels. Data types supported: Same as @p input * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels. @@ -55,7 +53,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayer * * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels]. - * Data types supported: QASYMM8/F16/F32. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destination tensor info. Data type supported: same as @p input * @param[in] mean Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: Same as @p input * @param[in] std Standard deviation values tensor info. 1 dimension with size equal to the number of input channels. @@ -64,12 +62,6 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); - - // Inherited methods overridden: - void run() override; - -private: - CLNormalizePlanarYUVLayerKernel _norm_kernel; /**< NormalizePlanarYUV layer kernel to run */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLPReluLayer.h b/arm_compute/runtime/CL/functions/CLPReluLayer.h index 42876cd714..7f8a41238c 100644 --- a/arm_compute/runtime/CL/functions/CLPReluLayer.h +++ b/arm_compute/runtime/CL/functions/CLPReluLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -42,14 +42,14 @@ public: * * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place * - * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor. 
Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input. * @param[out] output Destination tensor. Data type supported: same as @p input */ void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLPReluLayer * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input. * @param[in] output Destination tensor info. Data type supported: same as @p input * diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h index fd64d7b939..72b1587b02 100644 --- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h +++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -38,7 +38,7 @@ class CLPixelWiseMultiplication : public ICLSimpleFunction public: /** Initialise the kernel's inputs, output and convertion policy. * - * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32. + * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. 
@@ -52,7 +52,7 @@ public: ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication * - * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32. + * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. * @param[in] input2 An input tensor info. Data types supported: same as @p input1. * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h index 19acb7fb40..c78b558ac8 100644 --- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,14 +43,14 @@ class CLPoolingLayer : public ICLSimpleFunction public: /** Set the input and output tensors. * - * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/F16/F32. + * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
* @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/comparisons.cl index 8824b136b2..a41b7e2966 100644 --- a/src/core/CL/cl_kernels/comparisons.cl +++ b/src/core/CL/cl_kernels/comparisons.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32 + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types. * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -93,12 +93,13 @@ __kernel void DEFINE_KERNEL(OP_NAME)( #if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) /** This function compares two quantized tensors. * + * @note The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. 
-DSCALE_IN1=10 * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All quantized data types. * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -133,8 +134,8 @@ __kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)( Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16); - int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16); + int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16); + int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16); in_a = in_a - (int16)((int)OFFSET_IN1); in_b = in_b - (int16)((int)OFFSET_IN2); diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl index d47b733acd..db0873755e 100644 --- a/src/core/CL/cl_kernels/convert_fc_weights.cl +++ b/src/core/CL/cl_kernels/convert_fc_weights.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -32,7 +32,7 @@ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128 * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8, S8, QASYMM8, U16, S16, U32, S32, F16, F32 + * @param[in] src_ptr Pointer to the source image. 
Supported data types: All. * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl index ea2455c613..a9a6ac1947 100644 --- a/src/core/CL/cl_kernels/deconvolution_layer.cl +++ b/src/core/CL/cl_kernels/deconvolution_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ /** This function applies upsample on an input image. * - * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8/F16/F32 + * @param[in] src_ptr Pointer to the source image. Supported data types: All. * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/depth_to_space.cl index 2ffd0a40e7..5c2e8a1d57 100644 --- a/src/core/CL/cl_kernels/depth_to_space.cl +++ b/src/core/CL/cl_kernels/depth_to_space.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,7 +30,7 @@ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. 
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -72,7 +72,7 @@ __kernel void depth_to_space_nchw( * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl index 7550b4ba76..add86e3f2e 100644 --- a/src/core/CL/cl_kernels/dequantization_layer.cl +++ b/src/core/CL/cl_kernels/dequantization_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ * @note Quantization scale of input tensor is passed in with -DSCALE=scale. * @note Quantization offset of input tensor is passed in with -DOFFSET=offset. * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM8 + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl index 811aa1b865..aabde4119f 100644 --- a/src/core/CL/cl_kernels/gemv.cl +++ b/src/core/CL/cl_kernels/gemv.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -110,12 +110,12 @@ __kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VEC } } } -#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */ -#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) /** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor. * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -123,13 +123,13 @@ __kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VEC * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] weights_ptr Pointer to the weights tensor. Same as @p src_ptr + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: S32 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor @@ -158,14 +158,14 @@ __kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src), // This kernel handle 4 rows in per thread so that it can reuse the weights for(int i = 0; i < SRC_WIDTH; i += 4) { - int4 w = convert_int4(vload4(0, (__global uchar *)(current_weights + i * weights_stride_x))) + (int4)weights_offset; + int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset; int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y; - int4 tmp0 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s0))) + (int4)input_offset; - int4 tmp1 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s1))) + (int4)input_offset; - int4 tmp2 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s2))) + (int4)input_offset; - int4 tmp3 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s3))) + (int4)input_offset; + int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset; + int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset; + int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset; + int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset; // Accumulate acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3; @@ -197,4 +197,4 @@ __kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src), } } } -#endif /* defined(SRC_WIDTH) && defined(SRC_HEIGHT) */ +#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */ diff --git 
a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl index 925975d2ba..b2ba65f812 100644 --- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl +++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,7 +39,7 @@ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8 + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) @@ -102,7 +102,7 @@ __kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src), * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8 + * @param[in] src_ptr Pointer to the first source tensor. 
Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl index 989316d661..d277c6c56f 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -109,7 +109,7 @@ __kernel void pixelwise_mul_int( * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * - * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QSYMM16 + * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16 * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) @@ -117,7 +117,7 @@ __kernel void pixelwise_mul_int( * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 + * @param[in] in2_ptr Pointer to the source image. 
Supported data types: same as @p in1_ptr * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) @@ -125,7 +125,7 @@ __kernel void pixelwise_mul_int( * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32 + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index c8b5e07b47..207669e43e 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -391,28 +391,16 @@ __kernel void pooling_layer_optimized_3( #if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) -// Set the initial value for the pooling operation accordingly with the data type -#if defined(POOL_AVG) || defined(POOL_L2) -#define INITIAL_VALUE 0 -#else /* defined(POOL_AVG) || defined(POOL_L2) */ -#if FP16 -#define INITIAL_VALUE -HALF_MAX -#else // FP16 -#define INITIAL_VALUE -FLT_MAX -#endif // FP16 - -#endif // POOL_AVG - /** Performs a pooling function of pool size equal to N (NCHW) * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note -DFP16 must be passed at compile time if half float data type is used * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; * @note In case of average pooling the following information must be passed at compile time: * -DPOOL_AVG must be provided otherwise max pooling will be performed. * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) @@ -519,13 +507,13 @@ ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_siz /** Performs a pooling function of pool size equal to N (NHWC) * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32 - * @note -DFP16 must be passed at compile time if half float data type is used * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. 
-DPOOL_SIZE_X=13; * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * @note In case of average pooling the following information must be passed at compile time: * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl index 2df22d736c..3a370eea93 100644 --- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl +++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,18 +23,19 @@ */ #include "helpers.h" +#if defined(DATA_TYPE) && defined(INITIAL_VALUE) +#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) -#define VEC_FLOAT(VEC_SIZE) \ - VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) #define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_UCHAR(VEC_SIZE) VEC_DATA_TYPE(uchar, VEC_SIZE) #define CONVERT_RTE(x, type) (convert_##type##_rte((x))) #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) #define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ { \ const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \ const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \ - res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_UCHAR(VEC_SIZE)); \ + res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \ } #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ @@ -74,8 +75,10 @@ int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note Input data type must be passed at compile time using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * - * @param[in] input_ptr Pointer to the source image. 
Supported data types: QASYMM8 + * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -100,8 +103,8 @@ __kernel void pooling_layer_MxN_quantized_nchw( Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - int8 vdata = 0; - int sdata = 0; + int8 vdata = INITIAL_VALUE; + int sdata = INITIAL_VALUE; // Load data for(int y = 0; y < POOL_SIZE_Y; y++) @@ -109,17 +112,18 @@ __kernel void pooling_layer_MxN_quantized_nchw( int x = 0; for(; x <= ((int)POOL_SIZE_X - 8); x += 8) { - uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0)); - int8 data0 = convert_int8(data); - vdata = POOL_OP(vdata, data0); + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + int8 data0 = convert_int8(data); + vdata = POOL_OP(vdata, data0); } // Leftover for(; x < (int)POOL_SIZE_X; ++x) { - uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0)); - int data0 = convert_int(data); - sdata = POOL_OP(sdata, data0); + DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + int data0 = convert_int(data); + sdata = POOL_OP(sdata, data0); } } @@ -133,22 +137,22 @@ __kernel void pooling_layer_MxN_quantized_nchw( res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); #endif /* defined(POOL_AVG) */ - uchar result_u8 = convert_uchar(res); + DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - const float result_f32 = convert_float(result_u8); + const float result_f32 = convert_float(result_q8); const 
float input_offset = (float)OFFSET_IN1; const float input_scale = (float)SCALE_IN1; const float scale_out = (float)SCALE_OUT; const float offset_out = (float)OFFSET_OUT; const float in_f32 = (result_f32 - input_offset) * input_scale; const float out_f32 = in_f32 / scale_out + offset_out; - result_u8 = convert_uchar_sat(convert_int_rte(out_f32)); + result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE); #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - *(__global uchar *)output.ptr = result_u8; + *(__global DATA_TYPE *)output.ptr = result_q8; } int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h, @@ -158,7 +162,7 @@ int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int u #if defined(DST_DEPTH) int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y; #else /* defined(DST_DEPTH) */ - int start_y = get_global_id(2) * stride_y - pad_y; + int start_y = get_global_id(2) * stride_y - pad_y; #endif /* defined(DST_DEPTH) */ const int end_x = min(start_x + pool_size_x, upper_bound_w); @@ -178,8 +182,9 @@ int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int u * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * @note In case of average pooling the following information must be passed at compile time: * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * - * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8 + * @param[in] input_ptr Pointer to the source image. 
Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -209,17 +214,17 @@ __kernel void pooling_layer_MxN_quantized_nhwc( Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); #else /* defined(DST_DEPTH) */ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); #endif /* defined(DST_DEPTH) */ - int8 vdata = 0; + int8 vdata = INITIAL_VALUE; const int idx_width = get_global_id(1) * STRIDE_X; #if defined(DST_DEPTH) const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y; #else /* defined(DST_DEPTH) */ - const int idx_height = get_global_id(2) * STRIDE_Y; + const int idx_height = get_global_id(2) * STRIDE_Y; #endif /* defined(DST_DEPTH) */ for(int y = 0; y < POOL_SIZE_Y; ++y) @@ -231,9 +236,11 @@ __kernel void pooling_layer_MxN_quantized_nhwc( x1 = select(x1, PAD_X - idx_width - 1, y != y1); #if defined(DST_DEPTH) - uchar8 data = vload8(0, (__global uchar *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0)); + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0)); #else /* defined(DST_DEPTH) */ - uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); #endif /* defined(DST_DEPTH) */ int8 data0 = convert_int8(data); @@ -246,11 +253,13 @@ __kernel void pooling_layer_MxN_quantized_nhwc( vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, 
POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)))); #endif /* defined(POOL_AVG) */ - uchar8 out_u8 = convert_uchar8(vdata); + VEC_TYPE(8) + out_q8 = CONVERT(vdata, VEC_TYPE(8)); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - REQUANTIZE(8, out_u8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_u8); + REQUANTIZE(8, out_q8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q8); #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ // Store result - vstore8(out_u8, 0, (__global uchar *)output.ptr); + vstore8(out_q8, 0, (__global DATA_TYPE *)output.ptr); } +#endif /* defined(DATA_TYPE) && defined(INITIAL_VALUE) */ diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp index 5062fd1801..270eb78dcb 100644 --- a/src/core/CL/kernels/CLActivationLayerKernel.cpp +++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,14 +39,14 @@ #include #include -using namespace arm_compute; - +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32); static std::set quantized_supported_activations = { @@ -63,12 +63,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0), "For Quantized data type only tanh, logistic, relu and lower/upper bounded relu are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && 
(f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); + // Checks performed when output is configured if((output != nullptr) && (output->total_size() != 0)) { @@ -135,27 +138,11 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act const DataType dt = input->info()->data_type(); float a_const = act_info.a(); float b_const = act_info.b(); - int a_const_int = 0; - int b_const_int = 0; const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); const bool is_quantized = is_data_type_quantized(dt); const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) || (f_act == ActivationLayerInfo::ActivationFunction::TANH); - // Create quantized version of constants a, b if needed - if(dt == DataType::QASYMM8) - { - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - a_const_int = quantize_qasymm8(a_const, iq_info); - b_const_int = quantize_qasymm8(b_const, iq_info); - } - else if(dt == DataType::QSYMM16) - { - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - a_const_int = quantize_qsymm16(a_const, iq_info); - b_const_int = quantize_qsymm16(b_const, iq_info); - } - // Set build options CLBuildOptions build_opts; build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN"); @@ -164,28 +151,59 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); build_opts.add_option(("-DVEC_SIZE=" + 
support::cpp11::to_string(num_elems_processed_per_iteration))); - // Set A, B constants in build options - if(is_quantized && !perform_activation_in_float) - { - build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); - build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); - } - else - { - build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); - build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); - } + std::string kernel_name = std::string("activation_layer"); // Set quantization info build options if(is_quantized) { const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); + if(!perform_activation_in_float) + { + int a_const_int = 0; + int b_const_int = 0; + + // Create quantized version of constants a, b if needed + switch(dt) + { + case DataType::QASYMM8: + { + a_const_int = quantize_qasymm8(a_const, iq_info); + b_const_int = quantize_qasymm8(b_const, iq_info); + } + break; + case DataType::QASYMM8_SIGNED: + { + a_const_int = quantize_qasymm8_signed(a_const, iq_info); + b_const_int = quantize_qasymm8_signed(b_const, iq_info); + } + break; + case DataType::QSYMM16: + { + a_const_int = quantize_qsymm16(a_const, iq_info); + b_const_int = quantize_qsymm16(b_const, iq_info); + } + break; + default: + break; + } + build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); + build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); + } + else + { + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); + build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); + } + // Quantized value of 0 corresponds to the offset o1 build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? 
support::cpp11::to_string(iq_info.offset) : "0"))); build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale))); build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); + // Set correct kernel name + kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant"); + // Set scale and offset of the input and output if they have different quantization info if(output != nullptr) { @@ -198,14 +216,14 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act } } } - - // Create kernel - std::string kernel_name = std::string("activation_layer"); - if(is_quantized) + else { - kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant"); + // Set A, B constants in build options for float types + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); + build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); } + // Create kernel _kernel = create_opencl_kernel(_ctx, kernel_name, build_opts); // Make sure _kernel is initialized before calling the parent's configure _input = input; @@ -254,3 +272,4 @@ void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue) } while(collapsed.slide_window_slice_3D(slice)); } +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp index 5570ecfc79..afee429219 100644 --- a/src/core/CL/kernels/CLComparisonKernel.cpp +++ b/src/core/CL/kernels/CLComparisonKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -52,12 +52,7 @@ int calculate_num_elems_processed_per_iteration(const ITensorInfo &input) Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, - DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); ARM_COMPUTE_RETURN_ERROR_ON(supported_comparison_ops.count(operation) == 0); @@ -132,7 +127,7 @@ void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *inp build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info()))); build_opts.emplace("-DOP=" + operation_name); build_opts.emplace("-DOP_NAME=" + lower_string(operation_name)); - if(is_data_type_quantized_asymmetric(input1->info()->data_type())) + if(is_data_type_quantized(input1->info()->data_type())) { const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform(); diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp index 81856769b2..7ec6841149 100644 --- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp +++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -64,7 +64,7 @@ void CLConvertFullyConnectedWeightsKernel::configure(const ICLTensor *input, ICL // Set build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1)); build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2)); @@ -79,18 +79,15 @@ void CLConvertFullyConnectedWeightsKernel::configure(const ICLTensor *input, ICL Status CLConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, - DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, - DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3)); ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp index cd9552f149..ee392032ca 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ 
-32,8 +32,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -using namespace arm_compute; - +namespace arm_compute +{ CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel() : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN) { @@ -45,7 +45,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -82,7 +82,7 @@ void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTe // Create kernel CLBuildOptions build_opts; - build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option(("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()))); _kernel = static_cast(CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); constexpr unsigned int num_elems_processed_per_iteration = 1; @@ -156,3 +156,4 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu ARM_COMPUTE_ERROR("Unsupported data layout"); } } +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp index 49a5590231..f23f7ce542 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -36,6 +36,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); @@ -81,7 +82,7 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width))); diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp index 60659faaaf..f85cb7636a 100644 --- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -40,7 +40,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); if(output->tensor_shape().total_size() > 0) { diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp index 9e5d677e89..c158937839 100644 --- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -90,11 +90,11 @@ void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const _output = output; // Check if is a quantized operation - bool is_quantized = is_data_type_quantized_asymmetric(_input0->info()->data_type()); + const bool is_quantized = is_data_type_quantized_asymmetric(_input0->info()->data_type()); // Create kernel CLBuildOptions build_opts; - build_opts.add_option_if(!is_quantized, "-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input0->info()->dimension(0))); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input0->info()->dimension(1))); diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp index b255ba346f..220c2cd576 100644 --- 
a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -35,15 +35,15 @@ #include "support/ToolchainSupport.h" -using namespace arm_compute; - +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std); @@ -97,12 +97,8 @@ CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel() void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); - // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info())); _input = input; @@ -183,3 +179,4 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue } while(collapsed.slide_window_slice_3D(slice)); } +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp index 
50cdc9c7f4..6bdb1242a6 100644 --- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp +++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,10 +50,18 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input2); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, + 1, + DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, + 1, + DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); @@ -63,12 +71,17 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Validate in case of configured output if(output->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, + 1, 
+ DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8), "Output can only be QASYMM8 if both inputs are QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED), + "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16), "Output can only be QSYMM16 if both inputs are QSYMM16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); @@ -99,6 +112,10 @@ std::pair validate_and_configure_window(ITensorInfo *input1, ITe { set_data_type_if_unknown(*output, DataType::QASYMM8); } + else if(input1->data_type() == DataType::QASYMM8_SIGNED) + { + set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED); + } else if(input1->data_type() == DataType::QSYMM16) { set_data_type_if_unknown(*output, DataType::QSYMM16); diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index e3f1114f21..2d75e5f969 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -40,7 +40,8 @@ #include #include -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace @@ -57,19 +58,8 @@ void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo p Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - DataLayout data_layout = input->data_layout(); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - switch(data_layout) - { - case DataLayout::NCHW: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - break; - case DataLayout::NHWC: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2), "Unsupported combination of parameters!"); @@ -234,7 +224,25 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); - build_opts.add_option_if(data_type == DataType::F16, "-DFP16"); + // Set the initial value for the pooling operation accordingly with the data type + if(pool_type == PoolingType::MAX) + { + if(is_data_type_quantized(data_type)) + { + PixelValue type_min{}; + std::tie(type_min, std::ignore) = get_min_max(data_type); + build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get())); + } + else + { + build_opts.add_option("-DINITIAL_VALUE=" + 
float_to_string_with_full_precision(std::numeric_limits::lowest())); + } + } + else + { + // Pool AVG and Pool L2 initial value + build_opts.add_option("-DINITIAL_VALUE=0"); + } const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision(); const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); @@ -389,3 +397,4 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR("Not implemented"); } } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index c226e56aff..02927e83ad 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" -using namespace arm_compute; - +namespace arm_compute +{ void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout) { @@ -37,4 +37,5 @@ Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const DataLayout data_layout) { return CLConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout); -} \ No newline at end of file +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 08aef92eae..1581dd9c19 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,33 +21,24 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" +#include "support/ToolchainSupport.h" -using namespace arm_compute; +#include -CLDepthToSpaceLayer::CLDepthToSpaceLayer() - : _depth_to_space_kernel() +namespace arm_compute { -} - void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) { - _depth_to_space_kernel.configure(input, output, block_shape); + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output, block_shape); + _kernel = std::move(k); } Status CLDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) { return CLDepthToSpaceLayerKernel::validate(input, output, block_shape); } - -void CLDepthToSpaceLayer::run() -{ - CLScheduler::get().enqueue(_depth_to_space_kernel, true); -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 11d70e31fb..c5de591f5c 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,22 +24,18 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" +#include "support/ToolchainSupport.h" + +#include namespace arm_compute { -CLNormalizePlanarYUVLayer::CLNormalizePlanarYUVLayer() - : _norm_kernel() -{ -} - void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) { - _norm_kernel.configure(input, output, mean, std); + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output, mean, std); + _kernel = std::move(k); } Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output, @@ -47,9 +43,4 @@ Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITens { return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std); } - -void CLNormalizePlanarYUVLayer::run() -{ - CLScheduler::get().enqueue(_norm_kernel, true); -} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index 086017a7fd..f3ea926ae7 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,8 +28,8 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "support/ToolchainSupport.h" -using namespace arm_compute; - +namespace arm_compute +{ void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -40,12 +40,14 @@ void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const Poolin k->configure(input, output, pool_info); _kernel = std::move(k); + const DataType data_type = input->info()->data_type(); + // Configure border depending on operation required (quantize border in case of asymmetric data_type) BorderMode border_mode{}; PixelValue pixel_value(0.f); - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding()) + if(is_data_type_quantized_asymmetric(data_type) && !pool_info.exclude_padding()) { - pixel_value = PixelValue(static_cast(input->info()->quantization_info().uniform().offset)); + pixel_value = PixelValue(0, data_type, input->info()->quantization_info()); } switch(input->info()->data_layout()) { @@ -54,9 +56,16 @@ void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const Poolin break; case DataLayout::NHWC: border_mode = BorderMode::CONSTANT; - if(PoolingType::MAX == pool_info.pool_type() && !is_data_type_quantized_asymmetric(input->info()->data_type())) + if(PoolingType::MAX == pool_info.pool_type()) { - pixel_value = PixelValue(std::numeric_limits::lowest()); + if(is_data_type_quantized(data_type)) + { + std::tie(pixel_value, std::ignore) = get_min_max(data_type); + } + else + { + pixel_value = PixelValue(std::numeric_limits::lowest()); + } } break; default: @@ -71,4 +80,5 @@ void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const Poolin Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) { return CLPoolingLayerKernel::validate(input, output, pool_info); -} 
\ No newline at end of file +} +} // namespace arm_compute \ No newline at end of file diff --git a/tests/datasets/DatatypeDataset.h b/tests/datasets/DatatypeDataset.h index 72952e418e..cc79104ff1 100644 --- a/tests/datasets/DatatypeDataset.h +++ b/tests/datasets/DatatypeDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,6 +43,7 @@ public: { DataType::QSYMM8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM16, }) { diff --git a/tests/validation/CL/ActivationLayer.cpp b/tests/validation/CL/ActivationLayer.cpp index a17ad9b269..8b12b0b28b 100644 --- a/tests/validation/CL/ActivationLayer.cpp +++ b/tests/validation/CL/ActivationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -235,15 +235,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLActivationLayerQuantizedFixture, fra // Validate output validate(CLAccessor(_target), _reference, tolerance(_function, _data_type)); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLActivationLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), QuantizedActivationDataset), - framework::dataset::make("DataType", - DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 128.0f) }))) +TEST_SUITE_END() // QASYMM8 +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, CLActivationLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), QuantizedActivationDataset), + framework::dataset::make("DataType", + DataType::QASYMM8_SIGNED)), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 10.0f) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance(_function, _data_type)); } -TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE(QSYMM16) 
FIXTURE_DATA_TEST_CASE(RunSmall, CLActivationLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), QuantizedActivationDataset), framework::dataset::make("DataType", diff --git a/tests/validation/CL/NormalizePlanarYUVLayer.cpp b/tests/validation/CL/NormalizePlanarYUVLayer.cpp index 31e0625eed..54fff01915 100644 --- a/tests/validation/CL/NormalizePlanarYUVLayer.cpp +++ b/tests/validation/CL/NormalizePlanarYUVLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -123,7 +123,7 @@ FIXTURE_DATA_TEST_CASE(Random, CLNormalizePlanarYUVLayerFixture, framework // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, 0); } -TEST_SUITE_END() +TEST_SUITE_END() // FP16 TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(Random, CLNormalizePlanarYUVLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::RandomNormalizePlanarYUVLayerDataset(), @@ -133,8 +133,8 @@ FIXTURE_DATA_TEST_CASE(Random, CLNormalizePlanarYUVLayerFixture, framewor // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float template using CLNormalizePlanarYUVLayerQuantizedFixture = NormalizePlanarYUVLayerValidationQuantizedFixture; @@ -143,17 +143,27 @@ TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(Random, CLNormalizePlanarYUVLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::RandomNormalizePlanarYUVLayerDataset(), framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 128.0f) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8, 
0); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // QASYMM8 +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(Random, CLNormalizePlanarYUVLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::RandomNormalizePlanarYUVLayerDataset(), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 128.0f) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8, 0); +} +TEST_SUITE_END() // QASYMM8_SIGNED +TEST_SUITE_END() // Quantized -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // NormalizePlanarYUVLayer +TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/CL/PReluLayer.cpp b/tests/validation/CL/PReluLayer.cpp index 32fb2a113b..ce678d9aea 100644 --- a/tests/validation/CL/PReluLayer.cpp +++ b/tests/validation/CL/PReluLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -54,6 +54,9 @@ const auto PReluLayerU8Dataset = combine(combine(framework::dataset::make("DataT const auto PReluLayerQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("DataType", DataType::QASYMM8)); +const auto PReluLayerQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("DataType", + DataType::QASYMM8_SIGNED)); const auto PReluLayerS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)), framework::dataset::make("DataType", DataType::S16)); const auto PReluLayerFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)), @@ -165,7 +168,21 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerQuantizedFixture, framewor ) { // Validate output - validate(CLAccessor(_target), _reference, tolerance_fp32, 0.01); + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), + PReluLayerQASYMM8SIGNEDDataset), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 127.f, 20) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 127.f, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 127.f, 5) })) + + ) +{ + // Validate output + validate(CLAccessor(_target), _reference); } TEST_SUITE_END() TEST_SUITE_END() @@ -211,8 +228,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerFixture, framework::DatasetMo TEST_SUITE_END() TEST_SUITE(FP32) 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), shape) { // Create tensors CLTensor ref_src1 = create_tensor(shape, DataType::F32); diff --git a/tests/validation/CL/PixelWiseMultiplication.cpp b/tests/validation/CL/PixelWiseMultiplication.cpp index 22ff9f2fb9..3b55e25f37 100644 --- a/tests/validation/CL/PixelWiseMultiplication.cpp +++ b/tests/validation/CL/PixelWiseMultiplication.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -139,8 +139,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Scale", { 1.f, 2.f })), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)), @@ -151,7 +154,8 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLPixelWiseMultiplicationQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QSYMM16)), diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp index ff7c24f024..262cea3338 100644 --- a/tests/validation/CL/PoolingLayer.cpp +++ b/tests/validation/CL/PoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. 
+ * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -71,9 +71,10 @@ framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(5, 7) })), framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })), framework::dataset::make("ExcludePadding", { true })); -constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */ -constexpr AbsoluteTolerance tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ -constexpr AbsoluteTolerance tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */ +constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */ +constexpr AbsoluteTolerance tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ +constexpr AbsoluteTolerance tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */ +constexpr AbsoluteTolerance tolerance_qasymm8_s(1); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit signed asymmetric type */ const auto pool_data_layout_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }); const auto pool_fp_mixed_precision_dataset = framework::dataset::make("FpMixedPrecision", { true, false }); @@ -188,14 +189,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture, framew // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, 
combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8, - framework::dataset::make("DataType", DataType::QASYMM8))), - pool_data_layout_dataset)) +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8Small, + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))), + pool_data_layout_dataset)) { // Validate output - validate(CLAccessor(_target), _reference, tolerance_qasymm8); + validate(CLAccessor(_target), _reference, tolerance_qasymm8_s); } -TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE_END() // Quantized TEST_SUITE_END() // PoolingLayer TEST_SUITE_END() // CL diff --git a/tests/validation/reference/NormalizePlanarYUVLayer.cpp b/tests/validation/reference/NormalizePlanarYUVLayer.cpp index ea0e75a3c7..d2d29cc682 100644 --- a/tests/validation/reference/NormalizePlanarYUVLayer.cpp +++ b/tests/validation/reference/NormalizePlanarYUVLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -72,6 +72,17 @@ SimpleTensor normalize_planar_yuv_layer(const SimpleTensor +SimpleTensor normalize_planar_yuv_layer(const SimpleTensor &src, const SimpleTensor &mean, const SimpleTensor &std) +{ + SimpleTensor src_tmp = convert_from_asymmetric(src); + SimpleTensor mean_tmp = convert_from_asymmetric(mean); + SimpleTensor std_tmp = convert_from_asymmetric(std); + SimpleTensor dst_tmp = normalize_planar_yuv_layer(src_tmp, mean_tmp, std_tmp); + SimpleTensor dst = convert_to_asymmetric(dst_tmp, src.quantization_info()); + return dst; +} + template SimpleTensor normalize_planar_yuv_layer(const SimpleTensor &src, const SimpleTensor &mean, const SimpleTensor &std); template SimpleTensor normalize_planar_yuv_layer(const SimpleTensor &src, const SimpleTensor &mean, const SimpleTensor &std); } // namespace reference diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp index 010412c92b..40dd6fa505 100644 --- a/tests/validation/reference/PoolingLayer.cpp +++ b/tests/validation/reference/PoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -171,6 +171,15 @@ SimpleTensor pooling_layer(const SimpleTensor &src, c return dst; } +template <> +SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo) +{ + SimpleTensor src_tmp = convert_from_asymmetric(src); + SimpleTensor dst_tmp = pooling_layer_internal(src_tmp, info, output_qinfo); + SimpleTensor dst = convert_to_asymmetric(dst_tmp, output_qinfo); + return dst; +} + template <> SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo) { diff --git a/tests/validation/reference/UpsampleLayer.cpp b/tests/validation/reference/UpsampleLayer.cpp index d77f9ae348..a81a601057 100644 --- a/tests/validation/reference/UpsampleLayer.cpp +++ b/tests/validation/reference/UpsampleLayer.cpp @@ -23,6 +23,7 @@ */ #include "UpsampleLayer.h" +#include "arm_compute/core/utils/misc/Requires.h" #include "tests/validation/Helpers.h" namespace arm_compute @@ -33,10 +34,8 @@ namespace validation { namespace reference { -namespace -{ template -SimpleTensor upsample_function(const SimpleTensor &src, const Size2D &info, const InterpolationPolicy policy) +SimpleTensor upsample_layer(const SimpleTensor &src, const Size2D &info, const InterpolationPolicy policy) { ARM_COMPUTE_ERROR_ON(policy != InterpolationPolicy::NEAREST_NEIGHBOR); ARM_COMPUTE_UNUSED(policy); @@ -76,36 +75,12 @@ SimpleTensor upsample_function(const SimpleTensor &src, const Size2D &info return out; } -} // namespace - -template -SimpleTensor upsample_layer(const SimpleTensor &src, const Size2D &info, const InterpolationPolicy policy) -{ - return upsample_function(src, info, policy); -} - -template <> -SimpleTensor upsample_layer(const SimpleTensor &src, const Size2D &info, const InterpolationPolicy policy) -{ - SimpleTensor dst(src.shape(), src.data_type(), 1, src.quantization_info()); - - if(is_data_type_quantized_asymmetric(src.data_type())) - { - 
SimpleTensor src_tmp = convert_from_asymmetric(src); - SimpleTensor dst_tmp = upsample_function(src_tmp, info, policy); - dst = convert_to_asymmetric(dst_tmp, src.quantization_info()); - } - else - { - dst = upsample_function(src, info, policy); - } - return dst; -} - template SimpleTensor upsample_layer(const SimpleTensor &src, const Size2D &info, const InterpolationPolicy policy); template SimpleTensor upsample_layer(const SimpleTensor &src, const Size2D &info, const InterpolationPolicy policy); +template SimpleTensor upsample_layer(const SimpleTensor &src, + const Size2D &info, const InterpolationPolicy policy); template SimpleTensor upsample_layer(const SimpleTensor &src, const Size2D &info, const InterpolationPolicy policy); } // namespace reference -- cgit v1.2.1