From ef516e8bb8eb7f55b410268587f3b88b77e2fd8e Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Fri, 30 Apr 2021 14:46:05 +0100
Subject: Rename Quantization/Dequantization kernels/operators to imperative
 mood

Renames the following kernels/functions
 - [Cl|Cpu]DequantizationKernel -> [Cl|Cpu]DequantizeKernel
 - [Cl|Cpu]Dequantization -> [Cl|Cpu]Dequantize
 - [Cl|Cpu]QuantizationKernel -> [Cl|Cpu]QuantizeKernel
 - [Cl|Cpu]Quantization -> [Cl|Cpu]Quantize

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ic3c5eb3b7fe28f807294d159830eef99c2dd6219
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5566
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 Android.bp                                         |  16 +-
 SConscript                                         |   8 +-
 .../runtime/CL/functions/CLDequantizationLayer.h   |   2 +-
 .../runtime/CL/functions/CLQuantizationLayer.h     |   2 +-
 .../runtime/NEON/functions/NEDequantizationLayer.h |   2 +-
 .../runtime/NEON/functions/NEQuantizationLayer.h   |   2 +-
 src/core/cpu/kernels/CpuDequantizationKernel.cpp   | 400 ---------------------
 src/core/cpu/kernels/CpuDequantizationKernel.h     |  65 ----
 src/core/cpu/kernels/CpuDequantizeKernel.cpp       | 400 +++++++++++++++++++++
 src/core/cpu/kernels/CpuDequantizeKernel.h         |  64 ++++
 src/core/cpu/kernels/CpuQuantizationKernel.cpp     | 271 --------------
 src/core/cpu/kernels/CpuQuantizationKernel.h       |  92 -----
 src/core/cpu/kernels/CpuQuantizeKernel.cpp         | 266 ++++++++++++++
 src/core/cpu/kernels/CpuQuantizeKernel.h           |  90 +++++
 src/core/gpu/cl/kernels/ClDequantizationKernel.cpp | 155 --------
 src/core/gpu/cl/kernels/ClDequantizationKernel.h   |  69 ----
 src/core/gpu/cl/kernels/ClDequantizeKernel.cpp     | 153 ++++++++
 src/core/gpu/cl/kernels/ClDequantizeKernel.h       |  65 ++++
 src/core/gpu/cl/kernels/ClQuantizationKernel.cpp   | 177 ---------
 src/core/gpu/cl/kernels/ClQuantizationKernel.h     |  73 ----
 src/core/gpu/cl/kernels/ClQuantizeKernel.cpp       | 175 +++++++++
 src/core/gpu/cl/kernels/ClQuantizeKernel.h         |  70 ++++
 src/runtime/CL/functions/CLDequantizationLayer.cpp |  12 +-
 src/runtime/CL/functions/CLQuantizationLayer.cpp   |  12 +-
 .../NEON/functions/NEDequantizationLayer.cpp       |  12 +-
 src/runtime/NEON/functions/NEQuantizationLayer.cpp |  12 +-
 src/runtime/cpu/operators/CpuDequantization.cpp    |  54 ---
 src/runtime/cpu/operators/CpuDequantization.h      |  64 ----
 src/runtime/cpu/operators/CpuDequantize.cpp        |  54 +++
 src/runtime/cpu/operators/CpuDequantize.h          |  58 +++
 src/runtime/cpu/operators/CpuQuantization.cpp      |  58 ---
 src/runtime/cpu/operators/CpuQuantization.h        |  69 ----
 src/runtime/cpu/operators/CpuQuantize.cpp          |  58 +++
 src/runtime/cpu/operators/CpuQuantize.h            |  58 +++
 src/runtime/gpu/cl/operators/ClDequantization.cpp  |  54 ---
 src/runtime/gpu/cl/operators/ClDequantization.h    |  62 ----
 src/runtime/gpu/cl/operators/ClDequantize.cpp      |  53 +++
 src/runtime/gpu/cl/operators/ClDequantize.h        |  60 ++++
 src/runtime/gpu/cl/operators/ClQuantization.cpp    |  53 ---
 src/runtime/gpu/cl/operators/ClQuantization.h      |  67 ----
 src/runtime/gpu/cl/operators/ClQuantize.cpp        |  53 +++
 src/runtime/gpu/cl/operators/ClQuantize.h          |  62 ++++
 42 files changed, 1779 insertions(+), 1823 deletions(-)
 delete mode 100644 src/core/cpu/kernels/CpuDequantizationKernel.cpp
 delete mode 100644 src/core/cpu/kernels/CpuDequantizationKernel.h
 create mode 100644 src/core/cpu/kernels/CpuDequantizeKernel.cpp
 create mode 100644 src/core/cpu/kernels/CpuDequantizeKernel.h
 delete mode 100644 src/core/cpu/kernels/CpuQuantizationKernel.cpp
 delete mode 100644 src/core/cpu/kernels/CpuQuantizationKernel.h
 create mode 100644 src/core/cpu/kernels/CpuQuantizeKernel.cpp
 create mode 100644 src/core/cpu/kernels/CpuQuantizeKernel.h
 delete mode 100644 src/core/gpu/cl/kernels/ClDequantizationKernel.cpp
 delete mode 100644 src/core/gpu/cl/kernels/ClDequantizationKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClDequantizeKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClDequantizeKernel.h
 delete mode 100644 src/core/gpu/cl/kernels/ClQuantizationKernel.cpp
 delete mode 100644 src/core/gpu/cl/kernels/ClQuantizationKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClQuantizeKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClQuantizeKernel.h
 delete mode 100644 src/runtime/cpu/operators/CpuDequantization.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDequantization.h
 create mode 100644 src/runtime/cpu/operators/CpuDequantize.cpp
 create mode 100644 src/runtime/cpu/operators/CpuDequantize.h
 delete mode 100644 src/runtime/cpu/operators/CpuQuantization.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuQuantization.h
 create mode 100644 src/runtime/cpu/operators/CpuQuantize.cpp
 create mode 100644 src/runtime/cpu/operators/CpuQuantize.h
 delete mode 100644 src/runtime/gpu/cl/operators/ClDequantization.cpp
 delete mode 100644 src/runtime/gpu/cl/operators/ClDequantization.h
 create mode 100644 src/runtime/gpu/cl/operators/ClDequantize.cpp
 create mode 100644 src/runtime/gpu/cl/operators/ClDequantize.h
 delete mode 100644 src/runtime/gpu/cl/operators/ClQuantization.cpp
 delete mode 100644 src/runtime/gpu/cl/operators/ClQuantization.h
 create mode 100644 src/runtime/gpu/cl/operators/ClQuantize.cpp
 create mode 100644 src/runtime/gpu/cl/operators/ClQuantize.h

diff --git a/Android.bp b/Android.bp
index 046b1c08a5..1354b365e4 100644
--- a/Android.bp
+++ b/Android.bp
@@ -299,7 +299,7 @@ cc_library_static {
         "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp",
         "src/core/cpu/kernels/CpuCopyKernel.cpp",
         "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp",
-        "src/core/cpu/kernels/CpuDequantizationKernel.cpp",
+        "src/core/cpu/kernels/CpuDequantizeKernel.cpp",
         "src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp",
         "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp",
         "src/core/cpu/kernels/CpuElementwiseKernel.cpp",
@@ -310,7 +310,7 @@ cc_library_static {
         "src/core/cpu/kernels/CpuPermuteKernel.cpp",
         "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp",
         "src/core/cpu/kernels/CpuPoolingKernel.cpp",
-        "src/core/cpu/kernels/CpuQuantizationKernel.cpp",
+        "src/core/cpu/kernels/CpuQuantizeKernel.cpp",
         "src/core/cpu/kernels/CpuReshapeKernel.cpp",
         "src/core/cpu/kernels/CpuScaleKernel.cpp",
         "src/core/cpu/kernels/CpuSoftmaxKernel.cpp",
@@ -360,7 +360,7 @@ cc_library_static {
         "src/core/gpu/cl/kernels/ClCopyKernel.cpp",
         "src/core/gpu/cl/kernels/ClCropKernel.cpp",
         "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp",
-        "src/core/gpu/cl/kernels/ClDequantizationKernel.cpp",
+        "src/core/gpu/cl/kernels/ClDequantizeKernel.cpp",
         "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp",
         "src/core/gpu/cl/kernels/ClElementwiseKernel.cpp",
         "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp",
@@ -370,7 +370,7 @@ cc_library_static {
         "src/core/gpu/cl/kernels/ClMulKernel.cpp",
         "src/core/gpu/cl/kernels/ClPermuteKernel.cpp",
         "src/core/gpu/cl/kernels/ClPoolingKernel.cpp",
-        "src/core/gpu/cl/kernels/ClQuantizationKernel.cpp",
+        "src/core/gpu/cl/kernels/ClQuantizeKernel.cpp",
         "src/core/gpu/cl/kernels/ClReshapeKernel.cpp",
         "src/core/gpu/cl/kernels/ClScaleKernel.cpp",
         "src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp",
@@ -634,7 +634,7 @@ cc_library_static {
         "src/runtime/cpu/operators/CpuCopy.cpp",
         "src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp",
         "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp",
-        "src/runtime/cpu/operators/CpuDequantization.cpp",
+        "src/runtime/cpu/operators/CpuDequantize.cpp",
         "src/runtime/cpu/operators/CpuDirectConvolution.cpp",
         "src/runtime/cpu/operators/CpuElementwise.cpp",
         "src/runtime/cpu/operators/CpuElementwiseUnary.cpp",
@@ -644,7 +644,7 @@ cc_library_static {
         "src/runtime/cpu/operators/CpuPermute.cpp",
         "src/runtime/cpu/operators/CpuPooling.cpp",
         "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp",
-        "src/runtime/cpu/operators/CpuQuantization.cpp",
+        "src/runtime/cpu/operators/CpuQuantize.cpp",
         "src/runtime/cpu/operators/CpuReshape.cpp",
         "src/runtime/cpu/operators/CpuScale.cpp",
         "src/runtime/cpu/operators/CpuSoftmax.cpp",
@@ -656,7 +656,7 @@ cc_library_static {
         "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp",
         "src/runtime/gpu/cl/operators/ClCopy.cpp",
         "src/runtime/gpu/cl/operators/ClCrop.cpp",
-        "src/runtime/gpu/cl/operators/ClDequantization.cpp",
+        "src/runtime/gpu/cl/operators/ClDequantize.cpp",
         "src/runtime/gpu/cl/operators/ClDirectConvolution.cpp",
         "src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp",
         "src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp",
@@ -667,7 +667,7 @@ cc_library_static {
         "src/runtime/gpu/cl/operators/ClPRelu.cpp",
         "src/runtime/gpu/cl/operators/ClPermute.cpp",
         "src/runtime/gpu/cl/operators/ClPooling.cpp",
-        "src/runtime/gpu/cl/operators/ClQuantization.cpp",
+        "src/runtime/gpu/cl/operators/ClQuantize.cpp",
         "src/runtime/gpu/cl/operators/ClReshape.cpp",
         "src/runtime/gpu/cl/operators/ClScale.cpp",
         "src/runtime/gpu/cl/operators/ClSoftmax.cpp",
diff --git a/SConscript b/SConscript
index da92409867..e94ff1ed6d 100644
--- a/SConscript
+++ b/SConscript
@@ -307,13 +307,13 @@ if env['neon']:
                         'src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp',
                         'src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp',
                         'src/core/cpu/kernels/CpuCopyKernel.cpp',
-                        'src/core/cpu/kernels/CpuDequantizationKernel.cpp',
+                        'src/core/cpu/kernels/CpuDequantizeKernel.cpp',
                         'src/core/cpu/kernels/CpuElementwiseKernel.cpp',
                         'src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp',
                         'src/core/cpu/kernels/CpuFillKernel.cpp',
                         'src/core/cpu/kernels/CpuFloorKernel.cpp',
                         'src/core/cpu/kernels/CpuMulKernel.cpp',
-                        'src/core/cpu/kernels/CpuQuantizationKernel.cpp',
+                        'src/core/cpu/kernels/CpuQuantizeKernel.cpp',
                         'src/core/cpu/kernels/CpuScaleKernel.cpp',
                         'src/core/cpu/kernels/CpuSoftmaxKernel.cpp',
                         'src/core/cpu/kernels/CpuSubKernel.cpp',
@@ -354,13 +354,13 @@ if env['neon']:
                           'src/runtime/cpu/operators/CpuConcatenate.cpp',
                           'src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp',
                           'src/runtime/cpu/operators/CpuCopy.cpp',
-                          'src/runtime/cpu/operators/CpuDequantization.cpp',
+                          'src/runtime/cpu/operators/CpuDequantize.cpp',
                           'src/runtime/cpu/operators/CpuElementwise.cpp',
                           'src/runtime/cpu/operators/CpuElementwiseUnary.cpp',
                           'src/runtime/cpu/operators/CpuFill.cpp',
                           'src/runtime/cpu/operators/CpuFloor.cpp',
                           'src/runtime/cpu/operators/CpuMul.cpp',
-                          'src/runtime/cpu/operators/CpuQuantization.cpp',
+                          'src/runtime/cpu/operators/CpuQuantize.cpp',
                           'src/runtime/cpu/operators/CpuReshape.cpp',
                           'src/runtime/cpu/operators/CpuScale.cpp',
                           'src/runtime/cpu/operators/CpuSoftmax.cpp',
diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
index 601c13d0e4..b01fe9eb14 100644
--- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
@@ -36,7 +36,7 @@ class CLCompileContext;
 class ICLTensor;
 class ITensorInfo;
 
-/** Basic function to run @ref opencl::ClDequantization that dequantizes an input tensor */
+/** Basic function to run @ref opencl::ClDequantize that dequantizes an input tensor */
 class CLDequantizationLayer : public IFunction
 {
 public:
diff --git a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
index a61735cb97..6543496d93 100644
--- a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
@@ -37,7 +37,7 @@ class ITensorInfo;
 
 /** Basic function to simulate a quantization layer. This function calls the following CL kernels:
  *
- * -# @ref opencl::ClQuantization
+ * -# @ref opencl::ClQuantize
  *
  * @note The implementation supports only 3D input tensors.
  *
diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
index 91ed056cf3..8b49930ef5 100644
--- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
@@ -35,7 +35,7 @@ namespace arm_compute
 class ITensor;
 class ITensorInfo;
 
-/** Basic function to run @ref cpu::CpuDequantization that dequantizes an input tensor */
+/** Basic function to run @ref cpu::CpuDequantize that dequantizes an input tensor */
 class NEDequantizationLayer : public IFunction
 {
 public:
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index eeca2bb1db..7bf97e28a5 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -35,7 +35,7 @@ namespace arm_compute
 class ITensor;
 class ITensorInfo;
 
-/** Basic function to run a quantization layer using @ref cpu::CpuQuantization */
+/** Basic function to run a quantization layer using @ref cpu::CpuQuantize */
 class NEQuantizationLayer : public IFunction
 {
 public:
diff --git a/src/core/cpu/kernels/CpuDequantizationKernel.cpp b/src/core/cpu/kernels/CpuDequantizationKernel.cpp
deleted file mode 100644
index 2aa9fb9068..0000000000
--- a/src/core/cpu/kernels/CpuDequantizationKernel.cpp
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDequantizationKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NESymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
-
-    if(dst->tensor_shape().total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-    }
-
-    return Status{};
-}
-
-template <typename T>
-inline void store_result(T *ptr, const float32x4x4_t &v)
-{
-    ARM_COMPUTE_UNUSED(ptr, v);
-}
-
-template <>
-inline void store_result<float>(float *ptr, const float32x4x4_t &v)
-{
-    wrapper::vstore(ptr, v.val[0]);
-    wrapper::vstore(ptr + 4, v.val[1]);
-    wrapper::vstore(ptr + 8, v.val[2]);
-    wrapper::vstore(ptr + 12, v.val[3]);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
-{
-    wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
-    wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline void store_result(T *ptr, const float32x4x2_t &v)
-{
-    ARM_COMPUTE_UNUSED(ptr, v);
-}
-
-template <>
-inline void store_result<float>(float *ptr, const float32x4x2_t &v)
-{
-    wrapper::vstore(ptr, v.val[0]);
-    wrapper::vstore(ptr + 4, v.val[1]);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v)
-{
-    wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename TOut, typename TIn>
-void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
-{
-    const UniformQuantizationInfo &qinfo  = input->info()->quantization_info().uniform();
-    const float                    scale  = qinfo.scale;
-    const int32_t                  offset = qinfo.offset;
-
-    const int  window_step_x  = 16;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Create iterators
-    Iterator in(input, win_collapsed);
-    Iterator out(output, win_collapsed);
-
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const TIn *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, scale, offset);
-
-            store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            auto val       = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
-        }
-    },
-    in, out);
-}
-
-template <typename T>
-void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window)
-{
-    const auto scale = input->info()->quantization_info().scale();
-
-    const int  window_step_x  = 16;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    // Reset first dimension to handle tail calculations manually
-    Window win(window);
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Create iterators
-    Iterator in(input, win);
-    Iterator out(output, win);
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, scale[id.z()]);
-
-            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int8_t val     = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
-        }
-    },
-    in, out);
-}
-
-template <typename T>
-void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window)
-{
-    const auto scale = input->info()->quantization_info().scale();
-
-    const int  window_step_x  = 16;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    // Reset first dimension to handle tail calculations manually
-    Window win(window);
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Create iterators
-    Iterator in(input, win);
-    Iterator out(output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const float32x4x4_t vscale =
-            {
-                {
-                    scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3],
-                    scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7],
-                    scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11],
-                    scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15]
-                }
-            };
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, vscale);
-
-            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int8_t val     = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
-        }
-    },
-    in, out);
-}
-
-template <typename T>
-void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window)
-{
-    const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
-    const float                    scale = qinfo.scale;
-
-    const int  window_step_x  = 16;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Create iterators
-    Iterator in(input, win_collapsed);
-    Iterator out(output, win_collapsed);
-
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, scale);
-
-            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int8_t val     = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
-        }
-    },
-    in, out);
-}
-
-template <typename T>
-void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window)
-{
-    const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
-    const float                    scale = qinfo.scale;
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Create iterators
-    Iterator in(input, win_collapsed);
-    Iterator out(output, win_collapsed);
-
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const int16_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize_int16(vin, scale);
-
-            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int16_t val    = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
-        }
-    },
-    in, out);
-}
-
-template <typename T>
-void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
-{
-    switch(input->info()->data_type())
-    {
-        case DataType::QASYMM8:
-            run_dequantization_qasymm8<T, uint8_t>(input, output, window);
-            break;
-        case DataType::QASYMM8_SIGNED:
-            run_dequantization_qasymm8<T, int8_t>(input, output, window);
-            break;
-        case DataType::QSYMM8_PER_CHANNEL:
-            input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
-            break;
-        case DataType::QSYMM8:
-            run_dequantization_qsymm8<T>(input, output, window);
-            break;
-        case DataType::QSYMM16:
-            run_dequantization_qsymm16<T>(input, output, window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type.");
-    }
-}
-} // namespace
-
-void CpuDequantizationKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
-    // Configure kernel window
-    Window win = calculate_max_window(*src, Steps());
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
-
-    ICpuKernel::configure(win);
-}
-
-Status CpuDequantizationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-    return Status{};
-}
-
-void CpuDequantizationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
-    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
-    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
-
-    switch(dst->info()->data_type())
-    {
-        case DataType::F32:
-            run_dequantization_core<float>(src, dst, window);
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            run_dequantization_core<float16_t>(src, dst, window);
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type.");
-    }
-}
-const char *CpuDequantizationKernel::name() const
-{
-    return "CpuDequantizationKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDequantizationKernel.h b/src/core/cpu/kernels/CpuDequantizationKernel.h
deleted file mode 100644
index 8ac807097c..0000000000
--- a/src/core/cpu/kernels/CpuDequantizationKernel.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEQUANTIZATIONKERNEL_H
-#define ARM_COMPUTE_CPU_DEQUANTIZATIONKERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the dequantization layer kernel. */
-class CpuDequantizationKernel : public ICpuKernel
-{
-public:
-    /** Default constructor */
-    CpuDequantizationKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizationKernel);
-    /** Set input, output tensors.
-     *
-     * @param[in]  src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32.
-     */
-    void configure(const ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuDequantizationKernel
-     *
-     * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[in] dst Destination tensor info. Data types supported: F16/F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-    const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DEQUANTIZATIONKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.cpp b/src/core/cpu/kernels/CpuDequantizeKernel.cpp
new file mode 100644
index 0000000000..42b5439697
--- /dev/null
+++ b/src/core/cpu/kernels/CpuDequantizeKernel.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/CpuDequantizeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+    // A dst whose shape is still empty has nothing to be checked; otherwise its type and shape are checked below
+    if(dst->tensor_shape().total_size() == 0)
+    {
+        return Status{};
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    return Status{};
+}
+
+template <typename T>
+inline void store_result(T *ptr, const float32x4x4_t &v)
+{
+    ARM_COMPUTE_UNUSED(ptr, v); // Primary template stores nothing; only the specialisations below write to ptr
+}
+// float: writes the four 4-lane vectors of v at ptr, ptr + 4, ptr + 8 and ptr + 12
+template <>
+inline void store_result<float>(float *ptr, const float32x4x4_t &v)
+{
+    wrapper::vstore(ptr, v.val[0]);
+    wrapper::vstore(ptr + 4, v.val[1]);
+    wrapper::vstore(ptr + 8, v.val[2]);
+    wrapper::vstore(ptr + 12, v.val[3]);
+}
+// float16_t: each pair of f32 vectors is converted to f16 and combined, then written at ptr and ptr + 8
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
+{
+    wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+    wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename T>
+inline void store_result(T *ptr, const float32x4x2_t &v)
+{
+    ARM_COMPUTE_UNUSED(ptr, v); // Primary template stores nothing; only the specialisations below write to ptr
+}
+// float: writes the two 4-lane vectors of v at ptr and ptr + 4
+template <>
+inline void store_result<float>(float *ptr, const float32x4x2_t &v)
+{
+    wrapper::vstore(ptr, v.val[0]);
+    wrapper::vstore(ptr + 4, v.val[1]);
+}
+// float16_t: both f32 vectors are converted to f16 and combined into a single vector written at ptr
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v)
+{
+    wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename TOut, typename TIn>
+void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
+{
+    // Uniform quantization parameters of the input tensor
+    const UniformQuantizationInfo &uqinfo  = input->info()->quantization_info().uniform();
+    const float                    qscale  = uqinfo.scale;
+    const int32_t                  qoffset = uqinfo.offset;
+
+    const int vec_len = 16;
+    const int x_begin = static_cast<int>(window.x().start());
+    const int x_end   = static_cast<int>(window.x().end());
+
+    // X is walked by hand in the loop body, so the (possibly collapsed) loop window takes a single step along X
+    Window loop_win = window.collapse_if_possible(window, Window::DimZ);
+    loop_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Iterators driven by the loop window
+    Iterator src_it(input, loop_win);
+    Iterator dst_it(output, loop_win);
+
+    execute_window_loop(loop_win, [&](const Coordinates &)
+    {
+        const TIn *const src_row = reinterpret_cast<const TIn *>(src_it.ptr());
+        TOut *const      dst_row = reinterpret_cast<TOut *>(dst_it.ptr());
+
+        // Full vectors of vec_len elements
+        int x = x_begin;
+        while(x <= (x_end - vec_len))
+        {
+            const auto deq = vdequantize(wrapper::vloadq(src_row + x), qscale, qoffset);
+            store_result(dst_row + x, deq);
+            x += vec_len;
+        }
+
+        // Scalar tail for whatever does not fill a vector
+        for(; x < x_end; ++x)
+        {
+            dst_row[x] = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(src_row[x], uqinfo));
+        }
+    },
+    src_it, dst_it);
+}
+
+template <typename T>
+void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window)
+{
+    // Scales of the input; looked up below with the Z coordinate of the current window position
+    const auto channel_scales = input->info()->quantization_info().scale();
+
+    const int vec_len = 16;
+    const int x_begin = static_cast<int>(window.x().start());
+    const int x_end   = static_cast<int>(window.x().end());
+
+    // X is walked by hand in the loop body, so the loop window takes a single step along X
+    Window loop_win(window);
+    loop_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator src_it(input, loop_win);
+    Iterator dst_it(output, loop_win);
+
+    execute_window_loop(loop_win, [&](const Coordinates & id)
+    {
+        const int8_t *const src_row = reinterpret_cast<const int8_t *>(src_it.ptr());
+        T *const            dst_row = reinterpret_cast<T *>(dst_it.ptr());
+
+        // Full vectors of vec_len elements, all sharing the scale selected by id.z()
+        int x = x_begin;
+        while(x <= (x_end - vec_len))
+        {
+            const auto deq = vdequantize(wrapper::vloadq(src_row + x), channel_scales[id.z()]);
+            store_result<T>(dst_row + x, deq);
+            x += vec_len;
+        }
+
+        // Scalar tail for whatever does not fill a vector
+        for(; x < x_end; ++x)
+        {
+            const int8_t qval = src_row[x];
+            dst_row[x]        = static_cast<T>(dequantize(qval, channel_scales[id.z()]));
+        }
+    },
+    src_it, dst_it);
+}
+
+template <typename T>
+void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window)
+{
+    const auto scale = input->info()->quantization_info().scale(); // Indexed with the X position below (scale[x + lane])
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    // The X range is walked manually in the loop body (vector path plus scalar tail), so DimX is reduced to a single step
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Iterators for input and output; both are handed to execute_window_loop below
+    Iterator in(input, win);
+    Iterator out(output, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
+        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+        // Vector path: 16 int8 values per step, lane i being paired with scale[x + i]
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const float32x4x4_t vscale =
+            {
+                {
+                    scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3],
+                    scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7],
+                    scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11],
+                    scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15]
+                }
+            };
+            const auto vin  = wrapper::vloadq(in_ptr + x);
+            const auto vdeq = vdequantize(vin, vscale);
+
+            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+        }
+
+        // Scalar tail: elements that do not fill a 16-wide vector, each dequantized with scale[x]
+        for(; x < window_end_x; ++x)
+        {
+            int8_t val     = *(in_ptr + x);
+            *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
+        }
+    },
+    in, out);
+}
+
+template <typename T>
+void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window)
+{
+    // Only the scale of the uniform quantization info is used here
+    const float qscale = input->info()->quantization_info().uniform().scale;
+
+    const int vec_len = 16;
+    const int x_begin = static_cast<int>(window.x().start());
+    const int x_end   = static_cast<int>(window.x().end());
+
+    // X is walked by hand in the loop body, so the (possibly collapsed) loop window takes a single step along X
+    Window loop_win = window.collapse_if_possible(window, Window::DimZ);
+    loop_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Iterators driven by the loop window
+    Iterator src_it(input, loop_win);
+    Iterator dst_it(output, loop_win);
+
+    execute_window_loop(loop_win, [&](const Coordinates &)
+    {
+        const int8_t *const src_row = reinterpret_cast<const int8_t *>(src_it.ptr());
+        T *const            dst_row = reinterpret_cast<T *>(dst_it.ptr());
+
+        // Full vectors of vec_len elements
+        int x = x_begin;
+        while(x <= (x_end - vec_len))
+        {
+            const auto deq = vdequantize(wrapper::vloadq(src_row + x), qscale);
+            store_result<T>(dst_row + x, deq);
+            x += vec_len;
+        }
+
+        // Scalar tail for whatever does not fill a vector
+        for(; x < x_end; ++x)
+        {
+            const int8_t qval = src_row[x];
+            dst_row[x]        = static_cast<T>(dequantize(qval, qscale));
+        }
+    },
+    src_it, dst_it);
+}
+
+template <typename T>
+void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window)
+{
+    // Only the scale of the uniform quantization info is used here
+    const float qscale = input->info()->quantization_info().uniform().scale;
+
+    const int vec_len = 8;
+    const int x_begin = static_cast<int>(window.x().start());
+    const int x_end   = static_cast<int>(window.x().end());
+
+    // X is walked by hand in the loop body, so the (possibly collapsed) loop window takes a single step along X
+    Window loop_win = window.collapse_if_possible(window, Window::DimZ);
+    loop_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Iterators driven by the loop window
+    Iterator src_it(input, loop_win);
+    Iterator dst_it(output, loop_win);
+
+    execute_window_loop(loop_win, [&](const Coordinates &)
+    {
+        const int16_t *const src_row = reinterpret_cast<const int16_t *>(src_it.ptr());
+        T *const             dst_row = reinterpret_cast<T *>(dst_it.ptr());
+
+        // Full vectors of vec_len 16-bit elements
+        int x = x_begin;
+        while(x <= (x_end - vec_len))
+        {
+            const auto deq = vdequantize_int16(wrapper::vloadq(src_row + x), qscale);
+            store_result<T>(dst_row + x, deq);
+            x += vec_len;
+        }
+
+        // Scalar tail for whatever does not fill a vector
+        for(; x < x_end; ++x)
+        {
+            const int16_t qval = src_row[x];
+            dst_row[x]         = static_cast<T>(dequantize_qsymm16(qval, qscale));
+        }
+    },
+    src_it, dst_it);
+}
+
+template <typename T>
+void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
+{ // Dispatches on the input data type; T is the output element type selected by the caller
+    switch(input->info()->data_type())
+    {
+        case DataType::QASYMM8: // Input elements read as uint8_t
+            run_dequantization_qasymm8<T, uint8_t>(input, output, window);
+            break;
+        case DataType::QASYMM8_SIGNED: // Input elements read as int8_t
+            run_dequantization_qasymm8<T, int8_t>(input, output, window);
+            break;
+        case DataType::QSYMM8_PER_CHANNEL: // The input data layout picks the NHWC or the NCHW routine
+            input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
+            break;
+        case DataType::QSYMM8:
+            run_dequantization_qsymm8<T>(input, output, window);
+            break;
+        case DataType::QSYMM16:
+            run_dequantization_qsymm16<T>(input, output, window);
+            break;
+        default: // Any other input type is reported through ARM_COMPUTE_ERROR
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
+}
+} // namespace
+
+void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+    // Execution window: the maximum window over src with default steps
+    Window max_window = calculate_max_window(*src, Steps());
+
+    // If dst has not been initialised yet, set it up from the shape of src as F32
+    auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
+
+    ICpuKernel::configure(max_window);
+}
+
+Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    // The outcome of the argument checks is the outcome of the validation
+    return validate_arguments(src, dst);
+}
+
+void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+    // Tensors registered in the pack under the source and destination slots
+    const auto src_tensor = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst_tensor = tensors.get_tensor(TensorType::ACL_DST);
+    switch(dst_tensor->info()->data_type()) // The destination type selects the output element type
+    {
+        case DataType::F32:
+            run_dequantization_core<float>(src_tensor, dst_tensor, window);
+            return;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            run_dequantization_core<float16_t>(src_tensor, dst_tensor, window);
+            return;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
+}
+const char *CpuDequantizeKernel::name() const
+{
+    return "CpuDequantizeKernel"; // Kernel name string; identical to the class name
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.h b/src/core/cpu/kernels/CpuDequantizeKernel.h
new file mode 100644
index 0000000000..798f32cec7
--- /dev/null
+++ b/src/core/cpu/kernels/CpuDequantizeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H
+#define ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the dequantization layer kernel: configured from tensor infos, run on the tensors of an @ref ITensorPack. */
+class CpuDequantizeKernel : public ICpuKernel
+{
+public:
+    /** Default constructor */
+    CpuDequantizeKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizeKernel);
+    /** Configure the kernel from the source and destination tensor infos.
+     *
+     * @param[in]  src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst Destination tensor info with the same dimensions as @p src. Data type supported: F16/F32.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * @param[in] src Source tensor info. Same requirements as in @ref CpuDequantizeKernel::configure()
+     * @param[in] dst Destination tensor info. Same requirements as in @ref CpuDequantizeKernel::configure()
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuQuantizationKernel.cpp b/src/core/cpu/kernels/CpuQuantizationKernel.cpp
deleted file mode 100644
index 9b1e017275..0000000000
--- a/src/core/cpu/kernels/CpuQuantizationKernel.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuQuantizationKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/CPP/Validate.h"
-
-#include <arm_neon.h>
-#include <map>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-constexpr auto window_step = 16;
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
-    return Status{};
-}
-
-template <typename T>
-inline float32x4x4_t load_value(const T *input_ptr)
-{
-    using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type;
-    return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr));
-}
-
-template <>
-inline float32x4x4_t load_value(const float *input_ptr)
-{
-    return { wrapper::vloadq(input_ptr),
-             wrapper::vloadq(input_ptr + 4),
-             wrapper::vloadq(input_ptr + 8),
-             wrapper::vloadq(input_ptr + 12) };
-}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float32x4x4_t load_value(const float16_t *input_ptr)
-{
-    return { vcvt_f32_f16(wrapper::vload(input_ptr)),
-             vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
-             vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
-             vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
-}
-
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <typename element_type>
-using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>;
-
-template <typename quantized_type>
-vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi);
-
-template <>
-vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
-    return vquantize(qv, qi);
-}
-
-template <>
-vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
-    return vquantize_signed(qv, qi);
-}
-
-} // namespace
-
-CpuQuantizationKernel::CpuQuantizationKernel()
-    : _func(nullptr)
-{
-}
-
-void CpuQuantizationKernel::configure(ITensorInfo *src, ITensorInfo *dst)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
-    static const std::map<std::string, QuantizationFunctionExecutorPtr> quant_map =
-    {
-        { "op_QASYMM8_QASYMM8", &CpuQuantizationKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
-        { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizationKernel::run_quantize_qasymm8<uint8_t, int8_t> },
-        { "op_QASYMM8_QASYMM16", &CpuQuantizationKernel::run_quantize_qasymm16<uint8_t> },
-
-        { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizationKernel::run_quantize_qasymm8<int8_t, uint8_t> },
-        { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizationKernel::run_quantize_qasymm8<int8_t, int8_t> },
-        { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizationKernel::run_quantize_qasymm16<int8_t> },
-
-        { "op_F32_QASYMM8", &CpuQuantizationKernel::run_quantize_qasymm8<float, uint8_t> },
-        { "op_F32_QASYMM8_SIGNED", &CpuQuantizationKernel::run_quantize_qasymm8<float, int8_t> },
-        { "op_F32_QASYMM16", &CpuQuantizationKernel::run_quantize_qasymm16<float> },
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        { "op_F16_QASYMM8", &CpuQuantizationKernel::run_quantize_qasymm8<float16_t, uint8_t> },
-        { "op_F16_QASYMM8_SIGNED", &CpuQuantizationKernel::run_quantize_qasymm8<float16_t, int8_t> },
-        { "op_F16_QASYMM16", &CpuQuantizationKernel::run_quantize_qasymm16<float16_t> },
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
-    };
-
-    std::string function_to_call("op_");
-    function_to_call += string_from_data_type(src->data_type()) + "_";
-    function_to_call += string_from_data_type(dst->data_type());
-
-    auto it = quant_map.find(function_to_call);
-
-    if(it == quant_map.end())
-    {
-        ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
-    }
-    _func = it->second;
-
-    // Configure kernel window
-    Window win_config = calculate_max_window(*src, Steps());
-    ICpuKernel::configure(win_config);
-}
-
-Status CpuQuantizationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-    return Status{};
-}
-
-template <typename TIn, typename TOut>
-void CpuQuantizationKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
-{
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
-    {
-        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
-    }
-#ifdef __aarch64__
-    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else  //__aarch64__
-    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(src, win_collapsed);
-    Iterator output(dst, win_collapsed);
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
-        auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step); x += window_step)
-        {
-            wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
-        }
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
-        }
-    },
-    input, output);
-}
-
-template <typename T>
-void CpuQuantizationKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
-{
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
-    {
-        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
-    }
-#ifdef __aarch64__
-    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else  //__aarch64__
-    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(src, win_collapsed);
-    Iterator output(dst, win_collapsed);
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
-        auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step); x += window_step)
-        {
-            uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
-            vst1q_u16(&output_ptr[x], tmp.val[0]);
-            vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
-        }
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
-        }
-    },
-    input, output);
-}
-
-void CpuQuantizationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
-    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
-    (this->*_func)(src, dst, window);
-}
-
-const char *CpuQuantizationKernel::name() const
-{
-    return "CpuQuantizationKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/CpuQuantizationKernel.h b/src/core/cpu/kernels/CpuQuantizationKernel.h
deleted file mode 100644
index 51d9a4e94f..0000000000
--- a/src/core/cpu/kernels/CpuQuantizationKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_QUANTIZATIONKERNEL_H
-#define ARM_COMPUTE_CPU_QUANTIZATIONKERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors
- *
- */
-class CpuQuantizationKernel : public ICpuKernel
-{
-public:
-    /** Default constructor */
-    CpuQuantizationKernel();
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizationKernel);
-    /** Set the input, output.
-     *
-     * @param[in]  src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this kernel
-     */
-    void configure(ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuQuantizationKernel
-     *
-     * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[in] dst Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-    const char *name() const override;
-
-private:
-    /** Common signature for all the specialised @ref NEQuantizationLayerKernel functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using QuantizationFunctionExecutorPtr = void (CpuQuantizationKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
-    /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename TIn, typename TOut>
-    void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window);
-    /** Function to apply QASYMM16 quantization on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename T>
-    void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window);
-
-    QuantizationFunctionExecutorPtr _func;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_QUANTIZATIONKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.cpp b/src/core/cpu/kernels/CpuQuantizeKernel.cpp
new file mode 100644
index 0000000000..8ca81e8b11
--- /dev/null
+++ b/src/core/cpu/kernels/CpuQuantizeKernel.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/CpuQuantizeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "src/core/CPP/Validate.h"
+
+#include <arm_neon.h>
+#include <map>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+constexpr auto window_step = 16;
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) // Shared by configure() and validate(); Status{} means every check passed
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); // An empty dst is rejected: this kernel does not auto-initialise its output
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+
+    return Status{};
+}
+
+template <typename T>
+inline float32x4x4_t load_value(const T *input_ptr) // Generic case: load one 16-element vector of T and convert it to four float32x4 vectors
+{
+    using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+    return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr));
+}
+
+template <>
+inline float32x4x4_t load_value(const float *input_ptr) // float: four consecutive 4-element loads (16 values), no conversion
+{
+    return { wrapper::vloadq(input_ptr),
+             wrapper::vloadq(input_ptr + 4),
+             wrapper::vloadq(input_ptr + 8),
+             wrapper::vloadq(input_ptr + 12) };
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float32x4x4_t load_value(const float16_t *input_ptr) // float16_t: four 4-element loads, each passed through vcvt_f32_f16
+{
+    return { vcvt_f32_f16(wrapper::vload(input_ptr)),
+             vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+             vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+             vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename element_type>
+using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>; // NEON vector of window_step (16) elements of element_type
+
+template <typename quantized_type>
+vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); // Declared only; defined for uint8_t and int8_t below
+
+template <>
+vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    return vquantize(qv, qi); // unsigned 8-bit output
+}
+
+template <>
+vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    return vquantize_signed(qv, qi); // signed 8-bit output
+}
+
+} // namespace
+
+void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) // Validates src/dst, selects the specialised run function and sets the execution window
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+    static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = // Dispatch table keyed by "op_<src type>_<dst type>"
+    {
+        { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
+        { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> },
+        { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> },
+
+        { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> },
+        { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
+        { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },
+
+        { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
+        { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
+        { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> },
+        { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> },
+        { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> },
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    };
+
+    std::string function_to_call("op_"); // Build the lookup key from the src and dst data type names
+    function_to_call += string_from_data_type(src->data_type()) + "_";
+    function_to_call += string_from_data_type(dst->data_type());
+
+    auto it = quant_map.find(function_to_call);
+
+    if(it == quant_map.end()) // e.g. an F16 key when the F16 entries above are compiled out
+    {
+        ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
+    }
+    _func = it->second;
+
+    // Configure kernel window: maximum window over src with default Steps()
+    Window win_config = calculate_max_window(*src, Steps());
+    ICpuKernel::configure(win_config);
+}
+
+Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) // Runs the same argument checks as configure(), without configuring anything
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+    return Status{};
+}
+
+template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) // Quantizes src (element type TIn) into the 8-bit dst (element type TOut)
+{
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform(); // dst info by default; replaced below when src is itself quantized
+    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+    {
+        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); // combine src and dst quantization info
+    }
+#ifdef __aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; // only passed to the scalar left-over loop below
+#else  //__aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+    // Collapse window where possible (from DimZ) and reduce X to a single step: the X range is walked manually inside the lambda
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+        auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step); x += window_step) // vector body: window_step (16) elements per iteration
+        {
+            wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+        }
+        // Compute left-over elements (fewer than window_step remain) one at a time
+        for(; x < window_end_x; ++x)
+        {
+            output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
+        }
+    },
+    input, output);
+}
+
+template <typename T>
+void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) // Quantizes src (element type T) into a dst written as uint16_t
+{
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform(); // dst info by default; replaced below when src is itself quantized
+    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+    {
+        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); // combine src and dst quantization info
+    }
+#ifdef __aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; // only passed to the scalar left-over loop below
+#else  //__aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+    // Collapse window where possible (from DimZ) and reduce X to a single step: the X range is walked manually inside the lambda
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step); x += window_step) // vector body: window_step (16) elements per iteration
+        {
+            uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); // 16 results as two 8-lane halves
+            vst1q_u16(&output_ptr[x], tmp.val[0]);
+            vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
+        }
+        // Compute left-over elements (fewer than window_step remain) one at a time
+        for(; x < window_end_x; ++x)
+        {
+            output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
+        }
+    },
+    input, output);
+}
+
+void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) // Runs the function selected by configure() on the pack's ACL_SRC/ACL_DST tensors
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr); // _func is only assigned in configure()
+
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+    (this->*_func)(src, dst, window); // dispatch through the member-function pointer
+}
+
+const char *CpuQuantizeKernel::name() const // Returns the kernel's name as a string literal
+{
+    return "CpuQuantizeKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.h b/src/core/cpu/kernels/CpuQuantizeKernel.h
new file mode 100644
index 0000000000..d3422d3fbd
--- /dev/null
+++ b/src/core/cpu/kernels/CpuQuantizeKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
+#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 3D input tensors
+ */
+class CpuQuantizeKernel : public ICpuKernel
+{
+public:
+    /** Default constructor */
+    CpuQuantizeKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizeKernel);
+    /** Set the input, output.
+     *
+     * @param[in]  src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref CpuQuantizeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+
+private:
+    /** Common signature for all the specialised @ref CpuQuantizeKernel functions
+     * @param src,dst Source tensor (read) and destination tensor (written).
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+    /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
+     * @param src,dst Source tensor (read) and destination tensor (written).
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename TIn, typename TOut>
+    void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window);
+    /** Function to apply QASYMM16 quantization on a tensor.
+     * @param src,dst Source tensor (read) and destination tensor (written).
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename T>
+    void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window);
+
+    QuantizeFunctionExecutorPtr _func{ nullptr }; /**< Specialised function to run; stays nullptr until configure() selects one */
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClDequantizationKernel.cpp b/src/core/gpu/cl/kernels/ClDequantizationKernel.cpp
deleted file mode 100644
index 6421a08206..0000000000
--- a/src/core/gpu/cl/kernels/ClDequantizationKernel.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClDequantizationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
-
-    if(dst->tensor_shape().total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-    }
-
-    return Status{};
-}
-} // namespace
-
-ClDequantizationKernel::ClDequantizationKernel()
-{
-}
-
-void ClDequantizationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
-
-    auto padding_info = get_padding_info({ src, dst });
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
-    const int  vec_size_x     = 16 / dst->element_size();
-    const int  output_width_x = dst->tensor_shape().x();
-    const bool multi_access_x = (output_width_x / vec_size_x > 0);
-
-    const bool  is_quantized_per_channel = is_data_type_quantized_per_channel(src->data_type());
-    std::string kernel_name              = "dequantization_layer";
-
-    // Create kernel
-    CLBuildOptions build_opts;
-    if(!is_quantized_per_channel)
-    {
-        const UniformQuantizationInfo qinfo   = src->quantization_info().uniform();
-        const int                     qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0;
-        build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
-        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
-    }
-    else
-    {
-        kernel_name += "_per_channel";
-        kernel_name += src->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
-    }
-
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type()));
-    build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type()));
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
-
-    // Create kernel name
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*dst);
-    if(multi_access_x)
-    {
-        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
-    }
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClDequantizationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-    return Status{};
-}
-
-void ClDequantizationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
-    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type());
-
-    // Collapse windo
-    Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3);
-    Window slice      = new_window.first_slice_window_3D();
-
-    if(is_quantized_per_channel)
-    {
-        unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters
-        _kernel.setArg(idx++, src->quantization().scale->cl_buffer());
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, src, slice);
-        add_3D_tensor_argument(idx, dst, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(new_window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClDequantizationKernel.h b/src/core/gpu/cl/kernels/ClDequantizationKernel.h
deleted file mode 100644
index 3ccf90c204..0000000000
--- a/src/core/gpu/cl/kernels/ClDequantizationKernel.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DEQUANTIZATION_KERNEL_H
-#define ARM_COMPUTE_CL_DEQUANTIZATION_KERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the dequantization layer kernel. */
-class ClDequantizationKernel : public IClKernel
-{
-public:
-    /** Default constructor */
-    ClDequantizationKernel();
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizationKernel);
-    /** Initialise the kernel's input and output
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] dst             Destination tensor info. Data types supported: F16/F32.
-     */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref ClDequantizationKernel
-     *
-     * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[in] dst Output tensor info. Data types supported: F16/F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_DEQUANTIZATION_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp
new file mode 100644
index 0000000000..f2758b759f
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClDequantizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) // Shared by configure() and validate(); Status{} means every check passed
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+
+    if(dst->tensor_shape().total_size() > 0) // dst is only checked once it has a shape; an empty dst passes validation as-is
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst) // Builds the OpenCL dequantization kernel for src/dst and sets the execution window
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialization if not yet initialized (src shape, 1 channel, F32)
+    auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
+
+    auto padding_info = get_padding_info({ src, dst }); // snapshot, compared again at the end of configure()
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+    const int  vec_size_x     = 16 / dst->element_size(); // number of dst elements in 16 bytes
+    const int  output_width_x = dst->tensor_shape().x();
+    const bool multi_access_x = (output_width_x / vec_size_x > 0); // true when dst is at least one full vector wide
+
+    const bool  is_quantized_per_channel = is_data_type_quantized_per_channel(src->data_type());
+    std::string kernel_name              = "dequantization_layer";
+
+    // Set build options and pick the kernel variant
+    CLBuildOptions build_opts;
+    if(!is_quantized_per_channel)
+    {
+        const UniformQuantizationInfo qinfo   = src->quantization_info().uniform();
+        const int                     qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0; // offset is used for asymmetric types only
+        build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
+        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
+    }
+    else
+    {
+        kernel_name += "_per_channel"; // no SCALE/OFFSET defines here; run_op() passes the scale buffer as a kernel argument
+        kernel_name += src->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
+    }
+
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst);
+    if(multi_access_x)
+    {
+        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); // X end rounded up to a multiple of vec_size_x
+    }
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); // configuring must not have altered the tensors' padding
+}
+
+Status ClDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) // Runs the argument checks only; nothing is built or configured
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+    return Status{};
+}
+
+void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) // Enqueues the kernel slice by slice on the pack's ACL_SRC/ACL_DST tensors
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type());
+
+    // Collapse window where possible (from dimension 4 for per-channel data, from dimension 3 otherwise)
+    Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3);
+    Window slice      = new_window.first_slice_window_3D();
+
+    if(is_quantized_per_channel)
+    {
+        unsigned int idx = num_arguments_per_3D_tensor() * 2; // Skip the input and output parameters: the scale buffer goes right after the two 3D tensors
+        _kernel.setArg(idx++, src->quantization().scale->cl_buffer());
+    }
+
+    do
+    {
+        unsigned int idx = 0; // src and dst arguments are set again for every slice
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(new_window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.h b/src/core/gpu/cl/kernels/ClDequantizeKernel.h
new file mode 100644
index 0000000000..33e0164cc9
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClDequantizeKernel.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H
+#define ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the dequantization layer kernel. */
+class ClDequantizeKernel : public IClKernel
+{
+public:
+    /** Default constructor */
+    ClDequantizeKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizeKernel);
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst             Destination tensor info. Data types supported: F16/F32.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClDequantizeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClQuantizationKernel.cpp b/src/core/gpu/cl/kernels/ClQuantizationKernel.cpp
deleted file mode 100644
index 9926123529..0000000000
--- a/src/core/gpu/cl/kernels/ClQuantizationKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClQuantizationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-
-    // Output must always be initialized
-    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
-    return Status{};
-}
-} // namespace
-
-ClQuantizationKernel::ClQuantizationKernel()
-{
-}
-
-void ClQuantizationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    auto padding_info = get_padding_info({ src, dst });
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
-    const int  vec_size_x     = 16 / src->element_size();
-    const int  input_width_x  = src->tensor_shape().x();
-    const bool multi_access_x = (input_width_x / vec_size_x > 0);
-
-    const UniformQuantizationInfo qinfo            = dst->quantization_info().uniform();
-    const DataType                output_data_type = dst->data_type();
-
-    float   scale_to_apply  = qinfo.scale;
-    int32_t offset_to_apply = qinfo.offset;
-    if(is_data_type_quantized_asymmetric(src->data_type()))
-    {
-        /*
-         * In case of requantization of a quantized input tensor to an output tensor with another quantization
-         * instead of of apply dequantization and then a quantization functions, we just compute new scale and
-         * offset to apply.
-         *
-         * Assuming:
-         *   - q_i as input quantized value
-         *   - q_o as output quantized value
-         *   - z_i as input quantization offset value
-         *   - z_o as output quantization offset value
-         *   - s_i as input quantization scale value
-         *   - s_o as output quantization scale value
-         *   - z_n as new quantization offset value
-         *   - s_n as new quantization scale value
-         *
-         * q_o = ( q_i - z_i ) * s_i / s_o + z_o
-         *
-         * We can rewrite the formula as:
-         *
-         * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
-         *
-         * q_o = q_i / s_n + z_n
-         *
-         * Where:
-         *
-         * s_n = s_o / s_i
-         *
-         * z_n = - z_i * s_i / s_o + z_o
-         *
-         */
-        const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform();
-        scale_to_apply /= qinfo_in.scale;
-        // In order to minimize flooring we convert the offset to a float,
-        // then compute the new offset in the float domain,
-        // finally we convert it back as int32_t
-        offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
-    }
-
-    // Create kernel
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT");
-    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
-    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
-    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
-    std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type);
-    build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
-    build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
-
-    _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*src, Steps());
-    if(multi_access_x)
-    {
-        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
-    }
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClQuantizationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-    return Status{};
-}
-
-void ClQuantizationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
-    Window slice            = window_collapsed.first_slice_window_3D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, src, slice);
-        add_3D_tensor_argument(idx, dst, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClQuantizationKernel.h b/src/core/gpu/cl/kernels/ClQuantizationKernel.h
deleted file mode 100644
index 20822cf9c9..0000000000
--- a/src/core/gpu/cl/kernels/ClQuantizationKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_QUANTIZATION_KERNEL_H
-#define ARM_COMPUTE_CL_QUANTIZATION_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors.
- */
-class ClQuantizationKernel : public IClKernel
-{
-public:
-    /** Default constructor */
-    ClQuantizationKernel();
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizationKernel);
-    /** Set the input, output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] dst             Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this kernel
-     */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref ClQuantizationKernel
-     *
-     * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[in] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_QUANTIZATION_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp
new file mode 100644
index 0000000000..48d351d536
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/ClQuantizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+    // Output must always be initialized
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+
+    return Status{};
+}
+} // namespace
+
+void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    auto padding_info = get_padding_info({ src, dst });
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+    const int  vec_size_x     = 16 / src->element_size();
+    const int  input_width_x  = src->tensor_shape().x();
+    const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+    const UniformQuantizationInfo qinfo            = dst->quantization_info().uniform();
+    const DataType                output_data_type = dst->data_type();
+
+    float   scale_to_apply  = qinfo.scale;
+    int32_t offset_to_apply = qinfo.offset;
+    if(is_data_type_quantized_asymmetric(src->data_type()))
+    {
+        /*
+         * In case of requantization of a quantized input tensor to an output tensor with another quantization,
+         * instead of applying a dequantization followed by a quantization function, we just compute a new scale
+         * and offset to apply.
+         *
+         * Assuming:
+         *   - q_i as input quantized value
+         *   - q_o as output quantized value
+         *   - z_i as input quantization offset value
+         *   - z_o as output quantization offset value
+         *   - s_i as input quantization scale value
+         *   - s_o as output quantization scale value
+         *   - z_n as new quantization offset value
+         *   - s_n as new quantization scale value
+         *
+         * q_o = ( q_i - z_i ) * s_i / s_o + z_o
+         *
+         * We can rewrite the formula as:
+         *
+         * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
+         *
+         * q_o = q_i / s_n + z_n
+         *
+         * Where:
+         *
+         * s_n = s_o / s_i
+         *
+         * z_n = - z_i * s_i / s_o + z_o
+         *
+         */
+        const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform();
+        scale_to_apply /= qinfo_in.scale;
+        // In order to minimize flooring we convert the offset to a float,
+        // then compute the new offset in the float domain,
+        // and finally convert it back to int32_t
+        offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
+    }
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT");
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+    std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type);
+    build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
+    build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
+
+    _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps());
+    if(multi_access_x)
+    {
+        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+    return Status{};
+}
+
+void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window_collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.h b/src/core/gpu/cl/kernels/ClQuantizeKernel.h
new file mode 100644
index 0000000000..8d37f33032
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClQuantizeKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_QUANTIZE_KERNEL_H
+#define ARM_COMPUTE_CL_QUANTIZE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 3D input tensors.
+ */
+class ClQuantizeKernel : public IClKernel
+{
+public:
+    /** Default constructor */
+    ClQuantizeKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizeKernel);
+    /** Set the input, output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] dst             Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClQuantizeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_QUANTIZE_KERNEL_H */
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index e0381f90ae..3b104017e7 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -27,15 +27,15 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClDequantization.h"
+#include "src/runtime/gpu/cl/operators/ClDequantize.h"
 
 namespace arm_compute
 {
 struct CLDequantizationLayer::Impl
 {
-    const ICLTensor                          *src{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClDequantization> op{ nullptr };
+    const ICLTensor                      *src{ nullptr };
+    ICLTensor                            *dst{ nullptr };
+    std::unique_ptr<opencl::ClDequantize> op{ nullptr };
 };
 
 CLDequantizationLayer::CLDequantizationLayer()
@@ -54,13 +54,13 @@ void CLDequantizationLayer::configure(const CLCompileContext &compile_context, c
     _impl->src = input;
     _impl->dst = output;
 
-    _impl->op = std::make_unique<opencl::ClDequantization>();
+    _impl->op = std::make_unique<opencl::ClDequantize>();
     _impl->op->configure(compile_context, input->info(), output->info());
 }
 
 Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return opencl::ClDequantization::validate(input, output);
+    return opencl::ClDequantize::validate(input, output);
 }
 
 void CLDequantizationLayer::run()
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index 1f6ddb6014..e6451b2eb4 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -26,15 +26,15 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClQuantization.h"
+#include "src/runtime/gpu/cl/operators/ClQuantize.h"
 
 namespace arm_compute
 {
 struct CLQuantizationLayer::Impl
 {
-    const ICLTensor                        *src{ nullptr };
-    ICLTensor                              *dst{ nullptr };
-    std::unique_ptr<opencl::ClQuantization> op{ nullptr };
+    const ICLTensor                    *src{ nullptr };
+    ICLTensor                          *dst{ nullptr };
+    std::unique_ptr<opencl::ClQuantize> op{ nullptr };
 };
 
 CLQuantizationLayer::CLQuantizationLayer()
@@ -53,13 +53,13 @@ void CLQuantizationLayer::configure(const CLCompileContext &compile_context, con
     _impl->src = input;
     _impl->dst = output;
 
-    _impl->op = std::make_unique<opencl::ClQuantization>();
+    _impl->op = std::make_unique<opencl::ClQuantize>();
     _impl->op->configure(compile_context, input->info(), output->info());
 }
 
 Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return opencl::ClQuantization::validate(input, output);
+    return opencl::ClQuantize::validate(input, output);
 }
 
 void CLQuantizationLayer::run()
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 210fbe0eb2..91e37594af 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -26,15 +26,15 @@
 
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuDequantization.h"
+#include "src/runtime/cpu/operators/CpuDequantize.h"
 
 namespace arm_compute
 {
 struct NEDequantizationLayer::Impl
 {
-    const ITensor                          *src{ nullptr };
-    ITensor                                *dst{ nullptr };
-    std::unique_ptr<cpu::CpuDequantization> op{ nullptr };
+    const ITensor                      *src{ nullptr };
+    ITensor                            *dst{ nullptr };
+    std::unique_ptr<cpu::CpuDequantize> op{ nullptr };
 };
 
 NEDequantizationLayer::NEDequantizationLayer()
@@ -47,13 +47,13 @@ void NEDequantizationLayer::configure(const ITensor *input, ITensor *output)
 {
     _impl->src = input;
     _impl->dst = output;
-    _impl->op  = std::make_unique<cpu::CpuDequantization>();
+    _impl->op  = std::make_unique<cpu::CpuDequantize>();
     _impl->op->configure(input->info(), output->info());
 }
 
 Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return cpu::CpuDequantization::validate(input, output);
+    return cpu::CpuDequantize::validate(input, output);
 }
 
 void NEDequantizationLayer::run()
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 58ba68725b..e607917615 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -26,15 +26,15 @@
 
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuQuantization.h"
+#include "src/runtime/cpu/operators/CpuQuantize.h"
 
 namespace arm_compute
 {
 struct NEQuantizationLayer::Impl
 {
-    const ITensor                        *src{ nullptr };
-    ITensor                              *dst{ nullptr };
-    std::unique_ptr<cpu::CpuQuantization> op{ nullptr };
+    const ITensor                    *src{ nullptr };
+    ITensor                          *dst{ nullptr };
+    std::unique_ptr<cpu::CpuQuantize> op{ nullptr };
 };
 
 NEQuantizationLayer::NEQuantizationLayer()
@@ -45,14 +45,14 @@ NEQuantizationLayer::~NEQuantizationLayer() = default;
 
 Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return cpu::CpuQuantization::validate(input, output);
+    return cpu::CpuQuantize::validate(input, output);
 }
 
 void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
 {
     _impl->src = input;
     _impl->dst = output;
-    _impl->op  = std::make_unique<cpu::CpuQuantization>();
+    _impl->op  = std::make_unique<cpu::CpuQuantize>();
     _impl->op->configure(input->info(), output->info());
 }
 
diff --git a/src/runtime/cpu/operators/CpuDequantization.cpp b/src/runtime/cpu/operators/CpuDequantization.cpp
deleted file mode 100644
index 0a3f602da1..0000000000
--- a/src/runtime/cpu/operators/CpuDequantization.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuDequantization.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuDequantizationKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuDequantization::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::CpuDequantizationKernel>();
-    k->configure(src, dst);
-    _kernel = std::move(k);
-}
-
-Status CpuDequantization::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    return kernels::CpuDequantizationKernel::validate(src, dst);
-}
-
-void CpuDequantization::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    prepare(tensors);
-    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDequantization.h b/src/runtime/cpu/operators/CpuDequantization.h
deleted file mode 100644
index 22f8114149..0000000000
--- a/src/runtime/cpu/operators/CpuDequantization.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEQUANTIZATION_H
-#define ARM_COMPUTE_CPU_DEQUANTIZATION_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuDequantizationKernel that dequantizes an input tensor */
-class CpuDequantization : public ICpuOperator
-{
-public:
-    /** Default Constructor */
-    CpuDequantization() = default;
-    /** Configure the kernel.
-     *
-     * @param[in]  src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32.
-     */
-    void configure(const ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuDequantization
-     *
-     * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[in] dst Destination tensor info. Data type supported: F16/F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEQUANTIZATION_H */
diff --git a/src/runtime/cpu/operators/CpuDequantize.cpp b/src/runtime/cpu/operators/CpuDequantize.cpp
new file mode 100644
index 0000000000..80a2e28aee
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDequantize.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuDequantize.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/cpu/kernels/CpuDequantizeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+    auto k = std::make_unique<kernels::CpuDequantizeKernel>();
+    k->configure(src, dst);
+    _kernel = std::move(k);
+}
+
+Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::CpuDequantizeKernel::validate(src, dst);
+}
+
+void CpuDequantize::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    prepare(tensors);
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDequantize.h b/src/runtime/cpu/operators/CpuDequantize.h
new file mode 100644
index 0000000000..d1fb9e8d0e
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDequantize.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H
+#define ARM_COMPUTE_CPU_DEQUANTIZE_H
+
+#include "src/runtime/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */
+class CpuDequantize : public ICpuOperator
+{
+public:
+    /** Default Constructor */
+    CpuDequantize() = default;
+    /** Configure the kernel.
+     *
+     * @param[in]  src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst Destination tensor info with the same dimensions as @p src. Data type supported: F16/F32.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref CpuDequantize::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */
diff --git a/src/runtime/cpu/operators/CpuQuantization.cpp b/src/runtime/cpu/operators/CpuQuantization.cpp
deleted file mode 100644
index ede13850e7..0000000000
--- a/src/runtime/cpu/operators/CpuQuantization.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/runtime/cpu/operators/CpuQuantization.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuQuantizationKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-Status CpuQuantization::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizationKernel::validate(src, dst));
-    return Status{};
-}
-
-void CpuQuantization::configure(ITensorInfo *src, ITensorInfo *dst)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    // Configure quantize kernel
-    auto k = std::make_unique<kernels::CpuQuantizationKernel>();
-    k->configure(src, dst);
-    _kernel = std::move(k);
-}
-
-void CpuQuantization::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuQuantization.h b/src/runtime/cpu/operators/CpuQuantization.h
deleted file mode 100644
index 97f0c5fa79..0000000000
--- a/src/runtime/cpu/operators/CpuQuantization.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_QUANTIZATION_H
-#define ARM_COMPUTE_CPU_QUANTIZATION_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to simulate a quantization layer. This function calls the following Arm(R) Neon(TM) kernels:
- *
- *
- * -# @ref kernels::CpuQuantizationKernel
- *
- */
-class CpuQuantization : public ICpuOperator
-{
-public:
-    /** Default Constructor */
-    CpuQuantization() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
-     */
-    void configure(ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuQuantization
-     *
-     * @param[in] src Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[in] dst Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_QUANTIZATION_H */
diff --git a/src/runtime/cpu/operators/CpuQuantize.cpp b/src/runtime/cpu/operators/CpuQuantize.cpp
new file mode 100644
index 0000000000..5af7f6343b
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuQuantize.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/runtime/cpu/operators/CpuQuantize.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/cpu/kernels/CpuQuantizeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst));
+    return Status{};
+}
+
+void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Configure quantize kernel
+    auto k = std::make_unique<kernels::CpuQuantizeKernel>();
+    k->configure(src, dst);
+    _kernel = std::move(k);
+}
+
+void CpuQuantize::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuQuantize.h b/src/runtime/cpu/operators/CpuQuantize.h
new file mode 100644
index 0000000000..09afffd920
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuQuantize.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_QUANTIZE_H
+#define ARM_COMPUTE_CPU_QUANTIZE_H
+
+#include "src/runtime/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuQuantizeKernel that quantizes an input tensor */
+class CpuQuantize : public ICpuOperator
+{
+public:
+    /** Default Constructor */
+    CpuQuantize() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] dst Destination tensor info with the same dimensions as @p src. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref CpuQuantize::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */
diff --git a/src/runtime/gpu/cl/operators/ClDequantization.cpp b/src/runtime/gpu/cl/operators/ClDequantization.cpp
deleted file mode 100644
index df3203d2e1..0000000000
--- a/src/runtime/gpu/cl/operators/ClDequantization.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClDequantization.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClDequantizationKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClDequantization::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::ClDequantizationKernel>();
-    k->configure(compile_context, src, dst);
-    _kernel = std::move(k);
-}
-
-Status ClDequantization::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    return kernels::ClDequantizationKernel::validate(src, dst);
-}
-
-void ClDequantization::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDequantization.h b/src/runtime/gpu/cl/operators/ClDequantization.h
deleted file mode 100644
index a696b73d2e..0000000000
--- a/src/runtime/gpu/cl/operators/ClDequantization.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DEQUANTIZATION_H
-#define ARM_COMPUTE_CL_DEQUANTIZATION_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClDequantizationKernel that dequantizes an input tensor */
-class ClDequantization : public IClOperator
-{
-public:
-    /** Constructor */
-    ClDequantization() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] dst             Destination tensor info with the same dimensions of @p src. Data type supported: F16/F32.
-     */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayer
-     *
-     * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[in] dst Output tensor info. Data type supported: F16/F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited method overridden
-    void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DEQUANTIZATION_H */
diff --git a/src/runtime/gpu/cl/operators/ClDequantize.cpp b/src/runtime/gpu/cl/operators/ClDequantize.cpp
new file mode 100644
index 0000000000..0c1391bb45
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClDequantize.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClDequantize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/kernels/ClDequantizeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+    auto k = std::make_unique<kernels::ClDequantizeKernel>();
+    k->configure(compile_context, src, dst);
+    _kernel = std::move(k);
+}
+
+Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClDequantizeKernel::validate(src, dst);
+}
+
+void ClDequantize::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDequantize.h b/src/runtime/gpu/cl/operators/ClDequantize.h
new file mode 100644
index 0000000000..47fad3eeee
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClDequantize.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H
+#define ARM_COMPUTE_CL_DEQUANTIZE_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */
+class ClDequantize : public IClOperator
+{
+public:
+    /** Constructor */
+    ClDequantize() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst             Destination tensor info with the same dimensions as @p src. Data type supported: F16/F32.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClDequantize::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */
diff --git a/src/runtime/gpu/cl/operators/ClQuantization.cpp b/src/runtime/gpu/cl/operators/ClQuantization.cpp
deleted file mode 100644
index 2e753b550e..0000000000
--- a/src/runtime/gpu/cl/operators/ClQuantization.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClQuantization.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClQuantizationKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClQuantization::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::ClQuantizationKernel>();
-    k->configure(compile_context, src, dst);
-    _kernel = std::move(k);
-}
-
-Status ClQuantization::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    return kernels::ClQuantizationKernel::validate(src, dst);
-}
-
-void ClQuantization::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClQuantization.h b/src/runtime/gpu/cl/operators/ClQuantization.h
deleted file mode 100644
index d938ff95a0..0000000000
--- a/src/runtime/gpu/cl/operators/ClQuantization.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_QUANTIZATION_H
-#define ARM_COMPUTE_CL_QUANTIZATION_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to quantize a tensor. This function calls the following OpenCL kernel:
- *
- * -# @ref kernels::ClQuantizationKernel
- */
-class ClQuantization : public IClOperator
-{
-public:
-    /** Constructor */
-    ClQuantization() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/32.
-     * @param[out] dst             Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this function
-     */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayer
-     *
-     * @param[in] src Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/32.
-     * @param[in] dst Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited method overridden
-    void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} //namespace arm_compute
-#endif /* ARM_COMPUTE_CL_QUANTIZATION_H */
diff --git a/src/runtime/gpu/cl/operators/ClQuantize.cpp b/src/runtime/gpu/cl/operators/ClQuantize.cpp
new file mode 100644
index 0000000000..92bbb62ba5
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClQuantize.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClQuantize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/kernels/ClQuantizeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+    auto k = std::make_unique<kernels::ClQuantizeKernel>(); // Fresh kernel instance owned by this operator
+    k->configure(compile_context, src, dst);                // Forward all arguments unchanged to the kernel
+    _kernel = std::move(k);                                 // Keep the configured kernel; run() enqueues it later
+}
+
+Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClQuantizeKernel::validate(src, dst); // Validation is delegated entirely to the kernel
+}
+
+void ClQuantize::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); // Flag an empty tensor pack via the library's error macro
+    CLScheduler::get().enqueue_op(*_kernel.get(), tensors);          // NOTE(review): assumes configure() ran so _kernel is non-null
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClQuantize.h b/src/runtime/gpu/cl/operators/ClQuantize.h
new file mode 100644
index 0000000000..0b6d2c8cbe
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClQuantize.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_QUANTIZE_H
+#define ARM_COMPUTE_CL_QUANTIZE_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClQuantizeKernel that quantizes an input tensor */
+class ClQuantize : public IClOperator
+{
+public:
+    /** Constructor */
+    ClQuantize() = default;
+    /** Configure the operator from the source and destination tensor infos.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst             Destination tensor info with the same dimensions as @p src. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this function
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Takes the same @p src and @p dst tensor infos as @ref ClQuantize::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_QUANTIZE_H */
-- 
cgit v1.2.1