COMPMID-3532: Align data type support between doxygen and implementation - CL

Also removes some unused code. Change-Id: I85687c40999c3cdf9e6fccfcd020b0901a9515fe Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3581 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
author: Michele Di Giorgio <michele.digiorgio@arm.com> 2020-07-06 11:27:21 +0100
committer: Michele Di Giorgio <michele.digiorgio@arm.com> 2020-07-20 12:39:29 +0000
commit: f6f7876e9ee8b58a8a6b335b032d554412fa3983 (patch)
tree: 669c86bfc60ec99965151022a4112c53116b06c0 /src
parent: 954051f449788f22eb605b126f71af923950ca29 (diff)
download: ComputeLibrary-f6f7876e9ee8b58a8a6b335b032d554412fa3983.tar.gz
39 files changed, 168 insertions, 438 deletions
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index d18a319f96..895bb72827 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -218,11 +218,6 @@ std::string get_data_size_from_data_type(const DataType &dt)
     }
 }
 
-std::string get_underlying_cl_type_from_data_type(const DataType &dt)
-{
-    return get_cl_type_from_data_type(dt);
-}
-
 GPUTarget get_target_from_device(const cl::Device &device)
 {
     // Query device name size
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 72af73bc94..73874b69b2 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -203,7 +203,6 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "gather", "gather.cl" },
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
-    { "gemm_accumulate_biases", "gemm.cl" },
     { "gemm_ma_f16", "gemm.cl" },
     { "gemm_ma_f32", "gemm.cl" },
     { "gemm_mv", "gemv.cl" },
diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/activation_layer_quant.cl
index e304f7bb4c..0481319428 100644
--- a/src/core/CL/cl_kernels/activation_layer_quant.cl
+++ b/src/core/CL/cl_kernels/activation_layer_quant.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@
  * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
  * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
@@ -112,7 +112,7 @@ __kernel void activation_layer_quant_f32(
  * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
  * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
index 2bf603d5b9..9a87eb4af3 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -1,26 +1,26 @@
 /*
-* Copyright (c) 2018 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
+* Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
@@ -47,7 +47,7 @@
  * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
  *       K is equal to num_channels / num_groups.
  *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
@@ -110,7 +110,7 @@ __kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
  * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
  *       K is equal to num_channels / num_groups.
  *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/comparisons.cl
index 7d5d493f59..408846144d 100644
--- a/src/core/CL/cl_kernels/comparisons.cl
+++ b/src/core/CL/cl_kernels/comparisons.cl
@@ -51,7 +51,7 @@
  * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
  * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -59,7 +59,7 @@
  * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -115,7 +115,7 @@ __kernel void DEFINE_KERNEL(OP_NAME)(
  * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 8adcc1b4da..4281e675d7 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -73,7 +73,7 @@ inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset,
  * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
  * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
  *
- * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: All.
  * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -155,7 +155,7 @@ __kernel void concatenate_width_x2(
  * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
  * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
  *
- * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -388,7 +388,7 @@ __kernel void concatenate_height(
  * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
  * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
index 319798356a..0592e07511 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
  * -# -DDEPTH = The third dimension (depth) of the tensor (it is needed only if d == 3)
  * -# -DDATA_TYPE = Input and output datatypes.
  *
- * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -80,7 +80,7 @@ __kernel void copy_pad_tensor(
 #if defined(DATA_TYPE)
 /** Performs a copy of input tensor to the output tensor.
  *
- * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 59ae682ad8..e1f6505df7 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -692,7 +692,7 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
  * @attention Input's height and width should be 3
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 6225f17aed..d4bea4b2e8 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -144,7 +144,7 @@
 
 /** This function computes the depthwise convolution quantized.
  *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
@@ -152,7 +152,7 @@
  * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                                       src_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
- * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
@@ -160,7 +160,7 @@
  * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
- * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
@@ -461,9 +461,7 @@ __kernel void dwc_3x3_native_quantized8_nchw(
 #endif /*DILATION_X==1*/
 /** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
  *
- * @note Per-channel quantization is not supported by this kernel.
- *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
@@ -471,7 +469,7 @@ __kernel void dwc_3x3_native_quantized8_nchw(
  * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                                       src_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
- * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
@@ -479,7 +477,7 @@ __kernel void dwc_3x3_native_quantized8_nchw(
  * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
- * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
@@ -789,7 +787,7 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw(
  * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
  * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
  *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
@@ -809,7 +807,7 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw(
  * @param[in] dst_stride_w                                     Stride of the destination tensor in W dimension (in bytes)
  * @param[in] dst_step_w                                       dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
- * @param[in] weights_ptr                                      Pointer to the weights tensor reshaped. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_ptr                                      Pointer to the weights tensor reshaped. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
@@ -1028,7 +1026,7 @@ __kernel void dwc_3x3_reshaped_quantized8_nhwc(
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
  *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
@@ -1048,7 +1046,7 @@ __kernel void dwc_3x3_reshaped_quantized8_nhwc(
  * @param[in] dst_stride_w                                     Stride of the destination tensor in W dimension (in bytes)
  * @param[in] dst_step_w                                       dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
- * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
@@ -1378,7 +1376,7 @@ __kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc(
  * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
  *       If not, the quantization will be performed using a fixed point multiplication
  *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index 5b3266cc49..5775d899e8 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,7 @@
  * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
  * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
  *
- * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
+ * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]     buf_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -106,7 +106,7 @@ __kernel void fill_image_borders_replicate(
  * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
  * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
  *
- * @param[out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
+ * @param[out] buf_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  buf_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index e3ce6bf0cd..2360561f8a 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -61,7 +61,7 @@
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
  *
- * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
@@ -261,7 +261,7 @@ __kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
  *
- * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
@@ -412,7 +412,7 @@ __kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
  *                                      K0: 1,2,3,4,8,16
  *                                      H0: greater than 0
  *
- * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
@@ -566,7 +566,7 @@ __kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
  *                                      K0: 2,3,4,8,16
  *                                      H0: greater than 0
  *
- * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
@@ -7626,39 +7626,3 @@ __kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
     vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
 }
 #endif // defined(WIDTH_VECTOR_A)
-
-/** This kernel accumulates each row with the biases vector.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
- *
- * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]      accum_stride_x                       Stride of the accmulate tensor in X dimension (in bytes)
- * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      accum_stride_y                       Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in]      accum_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
- * @param[in]      biases_ptr                           Pointer to the biases vector. Same as @p accum_ptr
- * @param[in]      biases_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]      biases_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
-__kernel void gemm_accumulate_biases(
-    IMAGE_DECLARATION(accum),
-    VECTOR_DECLARATION(biases))
-{
-    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
-    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-
-    // Vector size, e.g. number of vector elements.
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
-    accum_value  = biases_value + accum_value;
-    // Store result in the accumulate buffer
-    VSTORE(VECTOR_SIZE)
-    (accum_value, 0, (__global DATA_TYPE *)accum.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 0f4a86c3bc..aac8d5a1e2 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1115,7 +1115,7 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
  * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
  * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1180,7 +1180,7 @@ __kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
  * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
  * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1253,7 +1253,7 @@ __kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
  * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
  * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3)
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/memset.cl
index e8bd1a5eff..bb46a49f84 100644
--- a/src/core/CL/cl_kernels/memset.cl
+++ b/src/core/CL/cl_kernels/memset.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
  * -# -DVEC_SIZE = Vector size
  * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might need to step back a bit)
  *
- * @param[in] tensor_ptr                           Pointer to the source image. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] tensor_ptr                           Pointer to the source image. Data types supported: All.
  * @param[in] tensor_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] tensor_step_x                        tensor_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] tensor_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/nonmax.cl
index e618b0a269..ab13131807 100644
--- a/src/core/CL/cl_kernels/nonmax.cl
+++ b/src/core/CL/cl_kernels/nonmax.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,13 @@
 
 /** This function performs Non maxima suppression over a 3x3 window on a given image.
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/F32
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: F32
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p scr_ptr
  * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/pad_layer.cl
index ff43b091fc..4e4d2ad9e7 100644
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ b/src/core/CL/cl_kernels/pad_layer.cl
@@ -51,7 +51,7 @@
  *       -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3)
  *       -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -165,7 +165,7 @@ __kernel void pad_layer_constant(TENSOR3D_DECLARATION(src),
  * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets:
  *       -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index c5fdfd8751..b2e56928d0 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -167,11 +167,11 @@ __kernel void reduction_operation_x(
  * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
  * @note In case of MIN and MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
  *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8 for operation MEAN
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptr
  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
@@ -233,13 +233,13 @@ __kernel void reduction_operation_non_parallel_x(
  * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
  *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptr
  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
@@ -316,7 +316,7 @@ __kernel void reduction_operation_y(
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -324,7 +324,7 @@ __kernel void reduction_operation_y(
  * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptt
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptr
  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
@@ -420,7 +420,7 @@ __kernel void reduction_operation_z(
  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
  * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
  *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -430,7 +430,7 @@ __kernel void reduction_operation_z(
  * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
  * @param[in] input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptt
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptr
  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/reorg_layer.cl
index 6a181dcd94..29344de37a 100644
--- a/src/core/CL/cl_kernels/reorg_layer.cl
+++ b/src/core/CL/cl_kernels/reorg_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,7 +39,7 @@
  * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
  * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -79,7 +79,7 @@ __kernel void reorg_layer_nchw(
  * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
  * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/reverse.cl
index 0cc6bc0ee2..10ffe84aeb 100644
--- a/src/core/CL/cl_kernels/reverse.cl
+++ b/src/core/CL/cl_kernels/reverse.cl
@@ -1,26 +1,26 @@
 /*
-* Copyright (c) 2018 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
+* Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
@@ -34,7 +34,7 @@
  * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint
  * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
  *
- * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/select.cl
index cb7988e7b0..52ef81560a 100644
--- a/src/core/CL/cl_kernels/select.cl
+++ b/src/core/CL/cl_kernels/select.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,7 +38,7 @@
  * @param[in]  c_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  c_step_z                          c_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
  * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -46,7 +46,7 @@
  * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
  * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -54,7 +54,7 @@
  * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -98,7 +98,7 @@ __kernel void select_same_rank(
  * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
  * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -106,7 +106,7 @@ __kernel void select_same_rank(
  * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
  * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -114,7 +114,7 @@ __kernel void select_same_rank(
  * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -162,7 +162,7 @@ __kernel void select_different_rank_2(
  * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
  * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -170,7 +170,7 @@ __kernel void select_different_rank_2(
  * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
  * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -178,7 +178,7 @@ __kernel void select_different_rank_2(
  * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index 96e2b15a9c..f4c5c4b60e 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -96,7 +96,7 @@ int4 mult_by_quantized_multiplier_parallel(int4 data)
  * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
  * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -548,7 +548,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
  * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
  * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/stack_layer.cl
index 59dee1d2fb..438e858df2 100644
--- a/src/core/CL/cl_kernels/stack_layer.cl
+++ b/src/core/CL/cl_kernels/stack_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,7 @@
  * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112)
  * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112)
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
index 51860b60a5..785be6c710 100644
--- a/src/core/CL/cl_kernels/transpose.cl
+++ b/src/core/CL/cl_kernels/transpose.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -132,7 +132,7 @@
  *  -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
  *  -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
  *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/upsample_layer.cl
index 0ce242e4df..d0cc0f24b7 100644
--- a/src/core/CL/cl_kernels/upsample_layer.cl
+++ b/src/core/CL/cl_kernels/upsample_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,13 +26,13 @@
 /** This function applies upsample on an input image. (NCHW)
  *
  * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
  * -# -DVEC_SIZE_IN = Input vector size
  * -# -DVEC_SIZE_OUT = Output vector size
  * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
  * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -81,13 +81,13 @@ __kernel void upsample_layer_nchw(
 /** This function applies upsample on an input image. (NHWC)
  *
  * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
  * -# -DVEC_SIZE_IN = Input vector size
  * -# -DVEC_SIZE_OUT = Output vector size
  * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
  * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 9e49869e15..66751f7dd3 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -27,8 +27,6 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
@@ -37,7 +35,6 @@
 #include "arm_compute/core/utils/misc/Cast.h"
 #include "support/StringSupport.h"
 
-#include <cmath>
 #include <set>
 
 namespace arm_compute
@@ -47,7 +44,7 @@ namespace
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32);
 
     static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations =
     {
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
index f0e869500e..9c2b68acc0 100644
--- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
@@ -27,20 +27,15 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 
 #include "support/StringSupport.h"
 
-#include <map>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output)
@@ -102,7 +97,7 @@ void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_co
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
     {
@@ -172,3 +167,4 @@ void CLBatchConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &
     }
     while(window.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 5306880035..10aa615c6f 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -27,20 +27,15 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 
 #include "support/StringSupport.h"
 
-#include <map>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output)
@@ -100,7 +95,7 @@ void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_co
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
     {
@@ -156,3 +151,4 @@ void CLDepthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &
     }
     while(window.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index cd032bba77..67dac3280e 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -35,13 +35,8 @@
 #include "arm_compute/core/Window.h"
 #include "support/StringSupport.h"
 
-#include <cstdint>
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLFillBorderKernel::CLFillBorderKernel()
     : ICLKernel(), _tensor(nullptr)
 {
@@ -85,7 +80,7 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT
 
     // Define build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
     build_opts.add_option("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top));
     build_opts.add_option("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom));
     build_opts.add_option("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left));
@@ -191,3 +186,4 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
deleted file mode 100644
index 30dee7fe21..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
-    ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target,
-                                                        unsigned int &num_elems_processed_per_iteration)
-{
-    // Select the vector size to use (8 for Bifrost; 16 for Midgard).
-    bool is_gpu_bifrost = gpu_target_is_in(gpu_target,
-                                           GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
-                                           GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
-                                           GPUTarget::G52, GPUTarget::G52LIT);
-    num_elems_processed_per_iteration = is_gpu_bifrost ? 8 : 16;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowStatic     biases_access(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->dimension(1));
-    AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, biases_access, accum_access);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
-    : _accum(nullptr), _biases(nullptr)
-{
-}
-
-void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), accum, biases);
-}
-
-void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context, ICLTensor *accum, const ICLTensor *biases)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
-
-    _biases = biases;
-    _accum  = accum;
-
-    // Get the target gpu
-    GPUTarget    gpu_target  = get_target();
-    unsigned int vector_size = 0;
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Add build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()));
-    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "gemm_accumulate_biases", build_opts.options());
-}
-
-Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target)
-{
-    unsigned int num_elems_processed_per_iteration = 0;
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get(), gpu_target, num_elems_processed_per_iteration).first);
-
-    return Status{};
-}
-
-void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    Window accum_slice = window.first_slice_window_2D();
-
-    Window biases_slice(accum_slice);
-    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    // Run kernel
-    do
-    {
-        // Set arguments
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _accum, accum_slice);
-        add_1D_tensor_argument(idx, _biases, biases_slice);
-
-        enqueue(queue, *this, accum_slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(accum_slice));
-}
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index 02ed1a9df5..f52384593b 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -44,7 +44,7 @@ constexpr unsigned int num_rows_read_per_iteration  = 4;
 Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
     ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index 9d8c42a50d..0da0d4ca1f 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -25,22 +25,16 @@
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
     : _input0(nullptr), _input1(nullptr), _output(nullptr)
 {
@@ -52,9 +46,7 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
 
@@ -152,3 +144,4 @@ void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::Comma
     }
     while(window.slide_window_slice_2D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index eccb6c8b63..c05df61edf 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 718b7a299c..c283c440a3 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -27,10 +27,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
@@ -98,7 +95,7 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DAXIS=" + support::cpp11::to_string(axis));
     build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(input->info()->dimension(2)));
     build_opts.add_option("-DDST_DIM3=" + support::cpp11::to_string(output->info()->dimension(3)));
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index 27d4508e1d..101055001c 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -52,14 +52,19 @@ Status CLUpsampleLayerKernel::validate(const ITensorInfo *input, const ITensorIn
     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
-    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(upsampling_policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
 
     return Status{};
 }
@@ -126,7 +131,7 @@ void CLUpsampleLayerKernel::configure(const CLCompileContext &compile_context, c
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_IN=" + support::cpp11::to_string(_num_elems_processed_per_iteration_input_x));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
     build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X_IN=" + support::cpp11::to_string(std::max<int>(_input->info()->dimension(0) - _num_elems_processed_per_iteration_input_x, 0)));
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
index 91db5b17c2..c87290d04c 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -28,11 +28,8 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/tensor_info.h"
@@ -110,7 +107,7 @@ void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
index d8afd1db2d..84744a68aa 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -28,11 +28,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/tensor_info.h"
@@ -131,7 +127,7 @@ void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index db996ebe43..20cb962b7e 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -27,10 +27,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
@@ -38,8 +35,6 @@
 
 #include "support/StringSupport.h"
 
-#include <map>
-
 namespace arm_compute
 {
 namespace
@@ -106,7 +101,7 @@ void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_co
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset));
     build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index a36e7dba22..e30b1dbb86 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -28,9 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 
-#include <cmath>
 #include <memory>
-#include <tuple>
 
 namespace arm_compute
 {
@@ -275,6 +273,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
                                         recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
                                         output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8);
 
     const int input_size  = input->dimension(0);
     const int batch_size  = input->dimension(1);
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 579c917d50..666e7b0786 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -29,9 +29,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include <utility>
-
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
 
 CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -43,9 +42,13 @@ CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
 Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
                             const ITensorInfo *output, const ActivationLayerInfo &info)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, recurrent_weights, bias, hidden_state, output);
+
     const int idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width));
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(1));
@@ -132,3 +135,4 @@ void CLRNNLayer::prepare()
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 4166c3230b..54e91fb8d8 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -24,22 +24,19 @@
 #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/Utils.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(),
+    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape(), _num_of_stages(), _reduction_axis(), _is_serial(),
       _is_reshape_required(false)
 {
 }
@@ -197,7 +194,6 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
 void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _op                  = op;
     _num_of_stages       = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
     _reduction_axis      = axis;
     _is_serial           = needs_serialized_reduction(op, input->info()->data_type(), axis);
@@ -259,65 +255,13 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
                 first_kernel_op        = ReductionOperation::MIN;
                 intermediate_kernel_op = ReductionOperation::MIN;
                 last_kernel_op         = ReductionOperation::MIN;
-                switch(input->info()->data_type())
-                {
-                    case DataType::F32:
-                    {
-                        pixelValue = PixelValue(std::numeric_limits<float>::max());
-                        break;
-                    }
-                    case DataType::F16:
-                    {
-                        pixelValue = PixelValue(static_cast<half>(65504.0f));
-                        break;
-                    }
-                    case DataType::QASYMM8:
-                    {
-                        pixelValue = std::get<1>(get_min_max(input->info()->data_type()));
-                        break;
-                    }
-                    case DataType::QASYMM8_SIGNED:
-                    {
-                        pixelValue = PixelValue(127, input->info()->data_type(), input->info()->quantization_info());
-                        break;
-                    }
-                    default:
-                    {
-                        ARM_COMPUTE_ERROR("Unsupported DataType");
-                    }
-                }
+                pixelValue             = std::get<1>(get_min_max(input->info()->data_type()));
                 break;
             case ReductionOperation::MAX:
                 first_kernel_op        = ReductionOperation::MAX;
                 intermediate_kernel_op = ReductionOperation::MAX;
                 last_kernel_op         = ReductionOperation::MAX;
-                switch(input->info()->data_type())
-                {
-                    case DataType::F32:
-                    {
-                        pixelValue = PixelValue(-std::numeric_limits<float>::max());
-                        break;
-                    }
-                    case DataType::F16:
-                    {
-                        pixelValue = PixelValue(static_cast<half>(-65504.0f));
-                        break;
-                    }
-                    case DataType::QASYMM8:
-                    {
-                        pixelValue = std::get<0>(get_min_max(input->info()->data_type()));
-                        break;
-                    }
-                    case DataType::QASYMM8_SIGNED:
-                    {
-                        pixelValue = PixelValue(-128, input->info()->data_type(), input->info()->quantization_info());
-                        break;
-                    }
-                    default:
-                    {
-                        ARM_COMPUTE_ERROR("Unsupported DataType");
-                    }
-                }
+                pixelValue             = std::get<0>(get_min_max(input->info()->data_type()));
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
author	Michele Di Giorgio <michele.digiorgio@arm.com>	2020-07-06 11:27:21 +0100
committer	Michele Di Giorgio <michele.digiorgio@arm.com>	2020-07-20 12:39:29 +0000
commit	f6f7876e9ee8b58a8a6b335b032d554412fa3983 (patch)
tree	669c86bfc60ec99965151022a4112c53116b06c0 /src
parent	954051f449788f22eb605b126f71af923950ca29 (diff)
download	ComputeLibrary-f6f7876e9ee8b58a8a6b335b032d554412fa3983.tar.gz