151 files changed, 2804 insertions, 16250 deletions
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index 909972482f..5b72354abe 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,37 +29,36 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+                                    "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
-    }
-    if(prev_output != nullptr && prev_output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32, DataType::S32);
-        if(output->total_size() != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
-        }
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64,
+                                                             DataType::U64);
     }
 
     return Status{};
@@ -67,59 +66,70 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_outp
 } // namespace
 
 CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel()
-    : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
+    : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const ICLTensor   *input,
+                                       ICLTensor         *output,
+                                       unsigned int       axis,
+                                       ReductionOperation op)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, prev_output, output, axis, op);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
 }
 
-void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
+                                       const ICLTensor        *input,
+                                       ICLTensor              *output,
+                                       unsigned int            axis,
+                                       ReductionOperation      op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape{ input->info()->tensor_shape() };
+    TensorShape output_shape{input->info()->tensor_shape()};
     output_shape.set(axis, 1);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true));
+    auto_init_if_empty(*output->info(), input->info()
+                                            ->clone()
+                                            ->set_tensor_shape(output_shape)
+                                            .set_data_type(DataType::S32)
+                                            .reset_padding()
+                                            .set_is_resizable(true));
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, op));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
 
-    auto padding_info = get_padding_info({ input, prev_output, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input          = input;
-    _prev_output    = prev_output;
     _output         = output;
     _reduction_axis = axis;
     _op             = op;
 
     // Set build options
-    const auto vector_size = (axis == 0) ? 16U : adjust_vec_size(16U, input->info()->dimension(0));
+    const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0));
+    const auto vector_size          = (adjusted_vector_size == 3U && axis == 0U)
+                                          ? 2U
+                                          : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16.
 
     CLBuildOptions build_opts;
-    build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(input->info()->dimension(0) % vector_size));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size));
     build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
     build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
     build_opts.add_option("-DDATA_TYPE_OUTPUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+    build_opts.add_option("-DCOND_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DUNROLL_WITH_PRAGMA=1");
 
     // Create kernel
-    cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
     std::string kernel_axis_name;
-    switch(axis)
+    switch (axis)
     {
         case 0:
-        {
-            const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
-            build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input_for_width->info()->dimension(0)));
-
+            build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
             kernel_axis_name = "x";
-            lws_hint         = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0), vector_size);
-        }
-        break;
+            break;
         case 1:
             build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
             kernel_axis_name = "y";
@@ -139,15 +149,18 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
     _kernel = create_kernel(compile_context, "arg_min_max_" + kernel_axis_name, build_opts.options());
 
     // Configure kernel window
-    Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output->info()) : (*input->info()), Steps(vector_size));
-    ICLKernel::configure_internal(win, lws_hint);
+    Window win = calculate_max_window(*input->info(), Steps(vector_size));
+    ICLKernel::configure_internal(win);
 
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input,
+                                        const ITensorInfo *output,
+                                        unsigned int       axis,
+                                        ReductionOperation op)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
     return Status{};
 }
 
@@ -156,43 +169,36 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    switch(_reduction_axis)
+    switch (_reduction_axis)
     {
         case 0:
         {
             // Set out window
             Window out_window(window);
+            Window in_window(window);
             out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+            in_window.set(Window::DimX,
+                          Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+            in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u));
 
             // Get first input and output slices
-            Window in_slice  = window.first_slice_window_2D();
+            Window in_slice  = in_window.first_slice_window_2D();
             Window out_slice = out_window.first_slice_window_2D();
-
-            // Reshape window
-            const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
-
-            // Set local sums buffer
-            unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
-            _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
             do
             {
                 unsigned int idx = 0;
                 add_2D_tensor_argument(idx, _input, in_slice);
-                if(_prev_output != nullptr)
-                {
-                    add_2D_tensor_argument(idx, _prev_output, in_slice);
-                }
                 add_2D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
         }
         break;
         case 1:
         {
             // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+            Window window_in{window};
+            window_in.set(Window::DimY,
+                          Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
             Window in_slice  = window_in.first_slice_window_2D();
             Window out_slice = window.first_slice_window_2D();
 
@@ -202,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
                 add_2D_tensor_argument(idx, _input, in_slice);
                 add_2D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
         }
         break;
         case 2:
         {
             // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+            Window window_in{window};
+            window_in.set(Window::DimZ,
+                          Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
             Window in_slice  = window_in.first_slice_window_3D();
             Window out_slice = window.first_slice_window_3D();
 
@@ -220,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
                 add_3D_tensor_argument(idx, _input, in_slice);
                 add_3D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+            } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
         }
         break;
         case 3:
         {
             // Get first input and output slices
-            Window window_in{ window };
+            Window window_in{window};
             window_in.set(3, Window::Dimension(0, 1, 1));
             Window in_slice  = window_in.first_slice_window_4D();
             Window out_slice = window.first_slice_window_4D();
@@ -238,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
                 add_4D_tensor_argument(idx, _input, in_slice);
                 add_4D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+            } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
         }
         break;
         default:
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
index 929677f905..fb3b41b0de 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -56,48 +57,46 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input       Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[in]  prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
-     *                         Has to be nullptr for the first iteration
-     * @param[out] output      Destination tensor. Data types supported: U32/S32
-     *                         Output will have the same number of dimensions as input.
-     * @param[in]  axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in]  op          Reduction operation to perform. Only ArgMin and ArgMax are supported.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[out] output Destination tensor. Data types supported: U32/S32
+     *                    Output will have the same number of dimensions as input.
+     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
+     * @param[in]  op     Reduction operation to perform. Only ArgMin and ArgMax are supported.
      */
-    void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
+    void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[in]  prev_output     Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
-     *                             Has to be nullptr for the first iteration
      * @param[out] output          Destination tensor. Data types supported: U32/S32
      *                             Output will have the same number of dimensions as input.
      * @param[in]  axis            Axis along which to reduce. Supported reduction axis : 0,1,2,3
      * @param[in]  op              Reduction operation to perform. Only ArgMin and ArgMax are supported.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   unsigned int            axis,
+                   ReductionOperation      op);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
      *
-     * @param[in] input       Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[in] prev_output Destination tensor info of the previous iterations. Data types supported: U32/S32
-     *                        Has to be nullptr for the first iteration
-     * @param[in] output      Destination tensor info. Data types supported: U32/S32
-     *                        Output will have the same number of dimensions as input.
-     * @param[in] axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in] op          Reduction operation to perform.  Only ArgMin and ArgMax are supported.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[in] output Destination tensor info. Data types supported: U32/S32
+     *                   Output will have the same number of dimensions as input.
+     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
+     * @param[in] op     Reduction operation to perform.  Only ArgMin and ArgMax are supported.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 
 private:
     const ICLTensor   *_input;
-    const ICLTensor   *_prev_output;
     ICLTensor         *_output;
     unsigned int       _reduction_axis;
     ReductionOperation _op;
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 44bdc6f587..c88a852a44 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,49 +29,58 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ITensorInfo *mean, const ITensorInfo *var,
-                          const ITensorInfo *beta, const ITensorInfo *gamma,
-                          float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo  *input,
+                          const ITensorInfo  *output,
+                          const ITensorInfo  *mean,
+                          const ITensorInfo  *var,
+                          const ITensorInfo  *beta,
+                          const ITensorInfo  *gamma,
+                          float               epsilon,
+                          ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_UNUSED(epsilon);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
-    if(beta != nullptr)
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+                                    input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+    if (beta != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
     }
-    if(gamma != nullptr)
+    if (gamma != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
     }
 
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
         ActivationLayerInfo::ActivationFunction act = act_info.activation();
         ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
-        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
-                                    && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
-                                    && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+                                    act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+                                    act !=
+                                        ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
         ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
     }
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -83,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
 
 std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
 {
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(16 / input->element_size(), input->dimension(0));
 
     // Configure kernel window
     Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
 
     bool window_changed = false;
-    if(output != nullptr)
+    if (output != nullptr)
     {
         AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
         window_changed = update_window_and_padding(win, input_access, output_access);
@@ -101,29 +111,50 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
         window_changed = update_window_and_padding(win, input_access);
     }
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
 
 CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
+    : _input(nullptr),
+      _output(nullptr),
+      _mean(nullptr),
+      _var(nullptr),
+      _beta(nullptr),
+      _gamma(nullptr),
+      _epsilon(0),
+      _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
-                                                float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(ICLTensor          *input,
+                                                ICLTensor          *output,
+                                                const ICLTensor    *mean,
+                                                const ICLTensor    *var,
+                                                const ICLTensor    *beta,
+                                                const ICLTensor    *gamma,
+                                                float               epsilon,
+                                                ActivationLayerInfo act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
-void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
-                                                const ICLTensor *gamma,
-                                                float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+                                                ICLTensor              *input,
+                                                ICLTensor              *output,
+                                                const ICLTensor        *mean,
+                                                const ICLTensor        *var,
+                                                const ICLTensor        *beta,
+                                                const ICLTensor        *gamma,
+                                                float                   epsilon,
+                                                ActivationLayerInfo     act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
 
-    auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma });
+    auto padding_info = get_padding_info({input, output, mean, var, beta, gamma});
     _input            = input;
     _output           = output;
     _mean             = mean;
@@ -138,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
                                                   mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
                                                   (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
 
-    unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+    unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
 
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
     build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -153,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
     build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
 
     // Create kernel
-    _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel =
+        create_kernel(compile_context,
+                      "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                      build_opts.options());
 
     // Set kernel static arguments
     unsigned int include_output = (!_run_in_place) ? 1 : 0;
-    unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
-    if(_beta != nullptr)
+    unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() +
+                       2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+    if (_beta != nullptr)
     {
         idx += num_arguments_per_1D_tensor(); // Skip beta parameter
     }
-    if(_gamma != nullptr)
+    if (_gamma != nullptr)
     {
         idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
     }
     _kernel.setArg<cl_float>(idx++, _epsilon);
 
-    if(output != nullptr)
+    if (output != nullptr)
     {
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*output->info(), *input->info()->clone());
     }
 
     // Configure kernel window
-    if(input->info()->data_layout() == DataLayout::NHWC)
+    if (input->info()->data_layout() == DataLayout::NHWC)
     {
         Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
         ICLKernel::configure_internal(win);
@@ -201,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
     _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
 }
 
-Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                                 const ITensorInfo *mean, const ITensorInfo *var,
-                                                 const ITensorInfo *beta, const ITensorInfo *gamma,
-                                                 float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo  *input,
+                                                 const ITensorInfo  *output,
+                                                 const ITensorInfo  *mean,
+                                                 const ITensorInfo  *var,
+                                                 const ITensorInfo  *beta,
+                                                 const ITensorInfo  *gamma,
+                                                 float               epsilon,
+                                                 ActivationLayerInfo act_info)
 {
     const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
 
-    if(input->data_layout() != DataLayout::NHWC)
+    if (input->data_layout() != DataLayout::NHWC)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
-                                    .first);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+                .first);
     }
 
     return Status{};
@@ -232,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
     unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor();
     add_1D_tensor_argument(idx, _mean, vector_slice);
     add_1D_tensor_argument(idx, _var, vector_slice);
-    if(_beta != nullptr)
+    if (_beta != nullptr)
     {
         add_1D_tensor_argument(idx, _beta, vector_slice);
     }
-    if(_gamma != nullptr)
+    if (_gamma != nullptr)
     {
         add_1D_tensor_argument(idx, _gamma, vector_slice);
     }
@@ -245,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
     {
         idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(!_run_in_place)
+        if (!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index 743f4a9594..1a88d2a8c5 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,8 @@
 #ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
 #define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
 
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -63,7 +65,13 @@ public:
      * @param[in]      epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
      */
-    void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+    void configure(ICLTensor          *input,
+                   ICLTensor          *output,
+                   const ICLTensor    *mean,
+                   const ICLTensor    *var,
+                   const ICLTensor    *beta     = nullptr,
+                   const ICLTensor    *gamma    = nullptr,
+                   float               epsilon  = 0.001f,
                    ActivationLayerInfo act_info = ActivationLayerInfo());
     /** Set the input and output tensors.
      *
@@ -81,8 +89,15 @@ public:
      * @param[in]      epsilon         (Optional) Small value to avoid division with zero. Default value is 0.001f.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
-                   const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext &compile_context,
+                   ICLTensor              *input,
+                   ICLTensor              *output,
+                   const ICLTensor        *mean,
+                   const ICLTensor        *var,
+                   const ICLTensor        *beta     = nullptr,
+                   const ICLTensor        *gamma    = nullptr,
+                   float                   epsilon  = 0.001f,
+                   ActivationLayerInfo     act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
      *
      * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -98,10 +113,14 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const ITensorInfo *mean, const ITensorInfo *var,
-                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
-                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo  *input,
+                           const ITensorInfo  *output,
+                           const ITensorInfo  *mean,
+                           const ITensorInfo  *var,
+                           const ITensorInfo  *beta     = nullptr,
+                           const ITensorInfo  *gamma    = nullptr,
+                           float               epsilon  = 0.001f,
+                           ActivationLayerInfo act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index da41feb7b8..c640b5a8d6 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,10 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -44,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -52,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
 
     return Status{};
 }
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+Status validate_arguments_static(const ITensorInfo *input,
+                                 const int          block_shape_x,
+                                 const int          block_shape_y,
+                                 const ITensorInfo *output,
+                                 const CropInfo    &crop_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -64,14 +71,12 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const int idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-        const int idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-        const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
+        const TensorShape expected_output_shape = compute_batch_to_space_shape(
+            input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+        const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -80,9 +85,9 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
 }
 } // namespace
 
-CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
-    : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
@@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
 }
 
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const ICLTensor        *block_shape,
+                                          ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    auto padding_info = get_padding_info({ input, block_shape, output });
+    auto padding_info = get_padding_info({input, block_shape, output});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
 
@@ -102,66 +110,83 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
     _block_shape = block_shape;
     _output      = output;
 
-    const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
-    build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
-    _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+    build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
+    _kernel = create_kernel(compile_context,
+                            "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                            build_opts.options());
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
+    Window win = calculate_max_window(*output->info(), Steps());
     ICLKernel::configure_internal(win);
 
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input,
+                                          const int32_t    block_shape_x,
+                                          const int32_t    block_shape_y,
+                                          ICLTensor       *output,
+                                          const CropInfo  &crop_info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
 }
 
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const int32_t           block_shape_x,
+                                          const int32_t           block_shape_y,
+                                          ICLTensor              *output,
+                                          const CropInfo         &crop_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+    const TensorShape output_shape = compute_batch_to_space_shape(
+        input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
 
     _input  = input;
     _output = output;
 
-    const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
-    build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
     build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
     build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
-    build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
-    _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left));
+    build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top));
+    _kernel = create_kernel(
+        compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+        build_opts.options());
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
+    Window win = calculate_max_window(*output->info(), Steps());
     ICLKernel::configure_internal(win);
 }
 
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
     return Status{};
 }
 
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+                                           const int32_t      block_shape_x,
+                                           const int32_t      block_shape_y,
+                                           const ITensorInfo *output,
+                                           const CropInfo    &crop_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
     return Status{};
 }
 
@@ -170,32 +195,31 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window slice_in  = window.first_slice_window_3D();
-    Window slice_out = window.first_slice_window_4D();
+    Window slice_out = window.first_slice_window_3D();
+    Window slice_in  = window.first_slice_window_4D();
 
     Window vector_slice = window.first_slice_window_1D();
     vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
 
-    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-    slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-    slice_out.set(3, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    slice_in.set(3, Window::Dimension(0, 0, 0));
 
     int batch_id = 0;
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
+        add_4D_tensor_argument(idx, _input, slice_in);
         add_argument(idx, batch_id);
-        if(_block_shape != nullptr)
+        if (_block_shape != nullptr)
         {
             add_1D_tensor_argument(idx, _block_shape, vector_slice);
         }
-        add_4D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_in, lws_hint());
+        add_3D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_out, lws_hint());
 
         ++batch_id;
-    }
-    while(window.slide_window_slice_3D(slice_in));
+    } while (window.slide_window_slice_3D(slice_out));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
index 131a43e59c..b9d3e66fe2 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -52,6 +53,8 @@ public:
      * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
      * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
      * @param[out] output      Tensor output. Data types supported: same as @p input
+     *
+     * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
      */
     void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
     /** Initialise the kernel's inputs and output.
@@ -60,16 +63,26 @@ public:
      * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
      * @param[in]  block_shape     1-D tensor with shape [M]. Data types supported: S32
      * @param[out] output          Tensor output. Data types supported: same as @p input
+     *
+     * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *block_shape,
+                   ICLTensor              *output);
     /** Initialise the kernel's inputs and output (Static block shape).
      *
      * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
      * @param[in]  block_shape_x Block shape x value.
      * @param[in]  block_shape_y Block shape y value.
      * @param[out] output        Tensor output. Data types supported: same as @p input
+     * @param[in]  crop_info     Specifies how the output shape is cropped after batch to space is performed
      */
-    void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
+    void configure(const ICLTensor *input,
+                   const int32_t    block_shape_x,
+                   const int32_t    block_shape_y,
+                   ICLTensor       *output,
+                   const CropInfo  &crop_info);
     /** Initialise the kernel's inputs and output (Static block shape).
      *
      * @param[in]  compile_context The compile context to be used.
@@ -77,8 +90,14 @@ public:
      * @param[in]  block_shape_x   Block shape x value.
      * @param[in]  block_shape_y   Block shape y value.
      * @param[out] output          Tensor output. Data types supported: same as @p input
+     * @param[in]  crop_info       Specifies how the output shape is cropped after batch to space is performed
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const int32_t           block_shape_x,
+                   const int32_t           block_shape_y,
+                   ICLTensor              *output,
+                   const CropInfo         &crop_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
      *
      * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -86,6 +105,8 @@ public:
      * @param[in] output      Tensor output. Data types supported: same as @p input
      *
      * @return a status
+     *
+     * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel (Static block shape).
@@ -94,10 +115,15 @@ public:
      * @param[in] block_shape_x Block shape x value.
      * @param[in] block_shape_y Block shape y value.
      * @param[in] output        Tensor output. Data types supported: same as @p input
+     * @param[in] crop_info     Specifies how the output shape is cropped after batch to space is performed
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const int32_t      block_shape_x,
+                           const int32_t      block_shape_y,
+                           const ITensorInfo *output,
+                           const CropInfo    &crop_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp
index b1f7c00fac..de3fb43de8 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,23 +27,30 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
-CLBitwiseKernel::CLBitwiseKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op)
+void CLBitwiseKernel::configure(const CLCompileContext &compile_context,
+                                const ICLTensor        *input1,
+                                const ICLTensor        *input2,
+                                ICLTensor              *output,
+                                BitwiseOperation        op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
-    if(op != BitwiseOperation::NOT)
+    if (op != BitwiseOperation::NOT)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
@@ -53,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*(output->info()), *(input1->info()));
-    auto padding_info = get_padding_info({ input1, input2, output });
+    auto padding_info = get_padding_info({input1, input2, output});
 
     // Configure kernel window
     const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0));
@@ -65,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
 
     // Create kernel
     std::string kernel_name = "";
-    switch(op)
+    switch (op)
     {
         case BitwiseOperation::AND:
             kernel_name = "bitwise_and";
@@ -104,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue)
     {
         unsigned int idx = 0;
         add_2D_tensor_argument(idx, _input1, slice);
-        if(_input2 != nullptr)
+        if (_input2 != nullptr)
         {
             add_2D_tensor_argument(idx, _input2, slice);
         }
         add_2D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
+    } while (window.slide_window_slice_2D(slice));
 }
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h
index c5a999643d..2c74955ae4 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.h
+++ b/src/core/CL/kernels/CLBitwiseKernel.h
@@ -59,7 +59,11 @@ public:
      * @param[out] output          Destination tensor. Data types supported: U8.
      * @param[in]  op              Bitwise operation to perform. Supported: AND, OR, NOT, XOR.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input1,
+                   const ICLTensor        *input2,
+                   ICLTensor              *output,
+                   BitwiseOperation        op);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 2c12275e57..f32c518e29 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -41,7 +41,10 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo              *boxes,
+                          const ITensorInfo              *pred_boxes,
+                          const ITensorInfo              *deltas,
+                          const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
@@ -54,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
 
     const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16;
-    if(is_qasymm16)
+    if (is_qasymm16)
     {
         const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform();
         ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
@@ -66,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
     }
 
-    if(pred_boxes->total_size() > 0)
+    if (pred_boxes->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes);
         ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
-        if(is_qasymm16)
+        if (is_qasymm16)
         {
             const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform();
             ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f);
@@ -84,21 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
 }
 } // namespace
 
-CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
-    : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const ICLTensor                *boxes,
+                                             ICLTensor                      *pred_boxes,
+                                             const ICLTensor                *deltas,
+                                             const BoundingBoxTransformInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
 }
 
-void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const CLCompileContext         &compile_context,
+                                             const ICLTensor                *boxes,
+                                             ICLTensor                      *pred_boxes,
+                                             const ICLTensor                *deltas,
+                                             const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
-    auto padding_info = get_padding_info({ boxes, pred_boxes, deltas });
-    auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+    auto padding_info = get_padding_info({boxes, pred_boxes, deltas});
+    auto_init_if_empty(*pred_boxes->info(), deltas->info()
+                                                ->clone()
+                                                ->set_data_type(boxes->info()->data_type())
+                                                .set_quantization_info(boxes->info()->quantization_info()));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
 
@@ -128,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
     build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
     build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
 
-    if(is_quantized)
+    if (is_quantized)
     {
         build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type()));
         const UniformQuantizationInfo boxes_qinfo      = boxes->info()->quantization_info().uniform();
@@ -148,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
 
     // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
     const unsigned int num_elems_processed_per_iteration = 4;
-    Window             win                               = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+    Window             win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
     ICLKernel::configure_internal(win);
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo              *boxes,
+                                              const ITensorInfo              *pred_boxes,
+                                              const ITensorInfo              *deltas,
+                                              const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
     return Status{};
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
index 08f350e86a..9a1bb49bb9 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -58,7 +58,10 @@ public:
      * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
      *
      */
-    void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+    void configure(const ICLTensor                *boxes,
+                   ICLTensor                      *pred_boxes,
+                   const ICLTensor                *deltas,
+                   const BoundingBoxTransformInfo &info);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -71,7 +74,11 @@ public:
      * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+    void configure(const CLCompileContext         &compile_context,
+                   const ICLTensor                *boxes,
+                   ICLTensor                      *pred_boxes,
+                   const ICLTensor                *deltas,
+                   const BoundingBoxTransformInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
      *
@@ -85,7 +92,10 @@ public:
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+    static Status validate(const ITensorInfo              *boxes,
+                           const ITensorInfo              *pred_boxes,
+                           const ITensorInfo              *deltas,
+                           const BoundingBoxTransformInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index c969792c3e..ec58bf9e7a 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,10 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -43,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
 
-    const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+    const unsigned int channels =
+        input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        num_groups == channels,
+        "Channel shuffling with same number of groups as number of channels would be inefficient");
     // There cannot be more groups than channels
     ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+                                    "The number of channels must be a multiple of the number of groups");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -66,28 +74,42 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, *input->clone());
 
-    const bool             is_nhwc                             = input->data_layout() == DataLayout::NHWC;
-    const unsigned int     num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size();
-    constexpr unsigned int num_elems_processed_per_iteration_y = 2;
+    const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+    if (is_nhwc)
+    {
+        unsigned int num_elems_processed_per_iteration_x =
+            adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+        Window win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
+        Window win_collapsed = win.collapse(win, Window::DimZ);
+        return std::make_pair(Status{}, win_collapsed);
+    }
+    else
+    {
+        const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / input->element_size();
+        constexpr unsigned int num_elems_processed_per_iteration_y = 2;
 
-    // Configure kernel window
-    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+        // Configure kernel window
+        Window win = calculate_max_window(
+            *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+                                           num_elems_processed_per_iteration_y);
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x,
+                                            num_elems_processed_per_iteration_y);
 
-    const bool window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->valid_region());
+        const bool window_changed = update_window_and_padding(win, input_access, output_access);
 
-    Window win_collapsed = win.collapse(win, Window::DimZ);
+        Window win_collapsed = win.collapse(win, Window::DimZ);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win_collapsed);
+        Status err =
+            (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+        return std::make_pair(err, win_collapsed);
+    }
 }
 } // namespace
 
-CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
-    : _input(nullptr), _output(nullptr)
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
@@ -95,27 +117,42 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
     configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
 }
 
-void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context,
+                                            const ICLTensor        *input,
+                                            ICLTensor              *output,
+                                            unsigned int            num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
+    auto padding_info = get_padding_info({input, output});
 
     _input  = input;
     _output = output;
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
-
     const DataLayout   data_layout = input->info()->data_layout();
     const bool         is_nhwc     = data_layout == DataLayout::NHWC;
-    const unsigned int channels    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
-    const unsigned int vec_size    = is_nhwc ? 4 : max_cl_vector_width / input->info()->element_size();
+    const unsigned int channels =
+        input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+    unsigned int vec_size_x           = 0;
+    unsigned int vec_size_x_leftovers = 0;
+    if (is_nhwc)
+    {
+        vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+        vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+    }
+    else
+    {
+        vec_size_x = max_cl_vector_width / input->info()->element_size();
+    }
 
     // Set kernel build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
     build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(is_nhwc, "-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers));
+    build_opts.add_option_if(is_nhwc, "-DSRC_DIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
-    build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(channels - vec_size), 0)));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
 
     // Create kernel
@@ -146,9 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    if (data_layout == DataLayout::NHWC)
+    {
+        ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+    }
 }
 
-Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
index 31c007f17e..43c939ebd8 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -60,7 +60,10 @@ public:
      * @param[out] output          Output tensor. Data type supported: Same as @p input
      * @param[in]  num_groups      Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   unsigned int            num_groups);
     /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
      *
      * @param[in] input      Input tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
deleted file mode 100644
index 44b8471725..0000000000
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLCol2ImKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, true, num_groups));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
-
-    constexpr unsigned int num_elems_read_per_iteration = 8;
-
-    // Configure window
-    Window win = calculate_max_window(*input, Steps(num_elems_read_per_iteration));
-
-    // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
-    AccessWindowHorizontal input_access(input, 0, num_elems_read_per_iteration);
-    bool                   window_changed = update_window_and_padding(win, input_access);
-
-    Coordinates coord;
-    coord.set_num_dimensions(output->num_dimensions());
-    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLCol2ImKernel::CLCol2ImKernel()
-    : _input(nullptr), _output(nullptr), _convolved_dims()
-{
-}
-
-void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, convolved_dims, num_groups);
-}
-
-void CLCol2ImKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims, num_groups));
-
-    _input          = input;
-    _output         = output;
-    _convolved_dims = convolved_dims;
-
-    const DataType data_type = input->info()->data_type();
-
-    // Create kernel
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
-    build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
-    build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
-    build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
-
-    _kernel = create_kernel(compile_context, "col2im", build_opts.options());
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), _convolved_dims, num_groups);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = "col2im_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(num_groups);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims, num_groups));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims, num_groups).first);
-    return Status{};
-}
-
-void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    bool is_collapsed     = false;
-    bool is_collapsed_out = false;
-
-    Window out_window;
-    out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
-    Window collapsed     = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
-    Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
-
-    ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
-
-    Window slice     = collapsed.first_slice_window_3D();
-    Window slice_out = collapsed_out.first_slice_window_4D();
-    do
-    {
-        // Set inputs
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_4D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLCol2ImKernel.h b/src/core/CL/kernels/CLCol2ImKernel.h
deleted file mode 100644
index 710e048bca..0000000000
--- a/src/core/CL/kernels/CLCol2ImKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOL2IMKERNEL_H
-#define ARM_COMPUTE_CLCOL2IMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the col2im reshaping kernel.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CLIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class CLCol2ImKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLCol2ImKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCol2ImKernel(const CLCol2ImKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCol2ImKernel &operator=(const CLCol2ImKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLCol2ImKernel(CLCol2ImKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default;
-    /** Default destructor */
-    ~CLCol2ImKernel() = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input          The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                            while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
-     * @param[in]  convolved_dims Output convolved dimensions.
-     * @param[in]  num_groups     (Optional) Number of groups when performing a grouped convolution
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[out] output          The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                             while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
-     * @param[in]  convolved_dims  Output convolved dimensions.
-     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel
-     *
-     * @param[in] input          The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                           while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
-     * @param[in] convolved_dims Output convolved dimensions.
-     * @param[in] num_groups     (Optional) Number of groups when performing a grouped convolution
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    Size2D           _convolved_dims;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCOL2IMKERNEL_H */
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index e2aee36bd8..a0f9aca54a 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,9 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -37,22 +40,16 @@ namespace arm_compute
 namespace
 {
 // Create supported comparisons map
-const std::map<ComparisonOperation, std::string> supported_comparison_ops =
-{
-    { ComparisonOperation::Equal, "EQUAL" },
-    { ComparisonOperation::NotEqual, "NOTEQUAL" },
-    { ComparisonOperation::Greater, "GREATER" },
-    { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
-    { ComparisonOperation::Less, "LESS" },
-    { ComparisonOperation::LessEqual, "LESSEQUAL" },
+const std::map<ComparisonOperation, std::string> supported_comparison_ops = {
+    {ComparisonOperation::Equal, "EQUAL"},     {ComparisonOperation::NotEqual, "NOTEQUAL"},
+    {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"},
+    {ComparisonOperation::Less, "LESS"},       {ComparisonOperation::LessEqual, "LESSEQUAL"},
 };
 
-int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
-{
-    return 16 / input.element_size();
-}
-
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+Status validate_arguments(const ITensorInfo  &input1,
+                          const ITensorInfo  &input2,
+                          const ITensorInfo  &output,
+                          ComparisonOperation operation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
     ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN);
@@ -63,7 +60,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured output
-    if(output.total_size() > 0)
+    if (output.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -75,45 +72,37 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
 {
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
-    const TensorShape &out_shape    = broadcast_pair.first;
-    const ValidRegion &valid_region = broadcast_pair.second;
-
-    const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
+    const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(16 / input1.element_size(), output.dimension(0));
 
     // Auto initialize output if not initialized
     auto_init_if_empty(output, out_shape, 1, DataType::U8, QuantizationInfo());
 
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
-
-    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+    Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration));
 
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace
 
-CLComparisonKernel::CLComparisonKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const ICLTensor    *input1,
+                                   const ICLTensor    *input2,
+                                   ICLTensor          *output,
+                                   ComparisonOperation operation)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
 }
 
-void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const CLCompileContext &compile_context,
+                                   const ICLTensor        *input1,
+                                   const ICLTensor        *input2,
+                                   ICLTensor              *output,
+                                   ComparisonOperation     operation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
@@ -129,17 +118,29 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
     const std::string &operation_name = supported_comparison_ops.at(operation);
     std::string        kernel_name    = "compare_" + lower_string(operation_name);
 
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(16 / input1->info()->element_size(), output->info()->dimension(0));
+
     // Set kernel build options
     std::set<std::string> build_opts;
     build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+    build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.emplace("-DVEC_SIZE_LEFTOVER=" +
+                       support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.emplace(
+        "-DVEC_SIZE_IN1=" + //
+        support::cpp11::to_string(input1->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+    build_opts.emplace(
+        "-DVEC_SIZE_IN2=" + //
+        support::cpp11::to_string(input2->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
     build_opts.emplace("-DOP=" + operation_name);
     build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
-    if(is_data_type_quantized(input1->info()->data_type()))
+    if (is_data_type_quantized(input1->info()->data_type()))
     {
         const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
         const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
 
+        build_opts.emplace("-DIS_QUANTIZED");
         build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
         build_opts.emplace("-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
         build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
@@ -163,12 +164,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
     _config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
 }
 
-Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparisonKernel::validate(const ITensorInfo  *input1,
+                                    const ITensorInfo  *input2,
+                                    const ITensorInfo  *output,
+                                    ComparisonOperation operation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
 
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
 
     return Status{};
 }
@@ -184,17 +189,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
 
     bool       can_collapse = true;
     const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
     {
         can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
         {
             can_collapse = (in_shape1[d] == in_shape2[d]);
         }
     }
 
     bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
 
     const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
     const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -215,16 +221,7 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 
-BorderSize CLComparisonKernel::border_size() const
-{
-    const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
-
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize{ 0, border, 0, 0 };
-}
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h
index 0b94190183..2fb4ba06b6 100644
--- a/src/core/CL/kernels/CLComparisonKernel.h
+++ b/src/core/CL/kernels/CLComparisonKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CLCOMPARISONKERNEL_H
-#define ARM_COMPUTE_CLCOMPARISONKERNEL_H
+#ifndef ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
+#define ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -64,7 +65,11 @@ public:
      * @param[out] output          Destination tensor. Data types supported: U8.
      * @param[in]  operation       Comparison operation to use.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input1,
+                   const ICLTensor        *input2,
+                   ICLTensor              *output,
+                   ComparisonOperation     operation);
     /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
      *
      * @param[in] input1    Source tensor. Data types supported: All.
@@ -74,11 +79,13 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+    static Status validate(const ITensorInfo  *input1,
+                           const ITensorInfo  *input2,
+                           const ITensorInfo  *output,
+                           ComparisonOperation operation);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
 
 private:
     const ICLTensor *_input1; /**< Source tensor 1 */
@@ -86,4 +93,4 @@ private:
     ICLTensor       *_output; /**< Destination tensor */
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOMPARISONKERNEL_H */
+#endif // ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
deleted file mode 100644
index dcf4e6662e..0000000000
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-CLConvertFullyConnectedWeightsKernel::CLConvertFullyConnectedWeightsKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLConvertFullyConnectedWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
-                                                     DataLayout data_layout)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
-}
-
-void CLConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
-                                                     DataLayout data_layout)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto initialisation if not yet initialized
-    auto_init_if_empty(*output->info(), *input->info()->clone());
-
-    auto padding_info = get_padding_info({ input, output });
-
-    ARM_COMPUTE_ERROR_THROW_ON(CLConvertFullyConnectedWeightsKernel::validate(input->info(), output->info(), original_input_shape, data_layout));
-
-    _input  = input;
-    _output = output;
-
-    const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
-
-    const int width_idx   = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
-    const int height_idx  = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
-    const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL);
-
-    const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx];
-    const unsigned int num_channels              = original_input_shape[channel_idx];
-
-    const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels;
-    const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane;
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-    build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1));
-    build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
-                                                      DataLayout data_layout)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3));
-    ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-
-void CLConvertFullyConnectedWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    unsigned int idx = 0;
-    add_2D_tensor_argument(idx, _input, window);
-    add_2D_tensor_argument(idx, _output, window);
-    enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index d1da793df2..0000000000
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- *       - It follows a Convolution layer
- *       - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-class CLConvertFullyConnectedWeightsKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLConvertFullyConnectedWeightsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvertFullyConnectedWeightsKernel(const CLConvertFullyConnectedWeightsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvertFullyConnectedWeightsKernel &operator=(const CLConvertFullyConnectedWeightsKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLConvertFullyConnectedWeightsKernel(CLConvertFullyConnectedWeightsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLConvertFullyConnectedWeightsKernel &operator=(CLConvertFullyConnectedWeightsKernel &&) = default;
-    /** Default destructor */
-    ~CLConvertFullyConnectedWeightsKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
-     * @param[in]  original_input_shape Shape of the original input tensor (the one entering fully connected layer).
-     * @param[in]  data_layout          The data layout the weights have been trained in.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
-    /** Set the input and output tensor.
-     *
-     * @param[in]  compile_context      The compile context to be used.
-     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
-     * @param[in]  original_input_shape Shape of the original input tensor (the one entering fully connected layer).
-     * @param[in]  data_layout          The data layout the weights have been trained in.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeightsKernel
-     *
-     * @param[in] input                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[in] output               The converted weights tensor info. Shape and Data Type: Same as @p input.
-     * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
-     * @param[in] data_layout          The data layout the weights have been trained in.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index d28cffa05f..f8ecc4c098 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -36,9 +38,11 @@ namespace arm_compute
 CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
     : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo   *input,
+                                                    const ITensorInfo   *output,
                                                     const PadStrideInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -58,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
 
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
-    for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
     }
@@ -66,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
     return Status{};
 }
 
-void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                                   const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
 }
 
-void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
-                                                   const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context,
+                                                   const ICLTensor        *input,
+                                                   ICLTensor              *output,
+                                                   const PadStrideInfo    &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input       = input;
     _output      = output;
@@ -96,7 +101,6 @@ void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compi
     // Configure kernel window
     Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure_internal(win);
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
@@ -118,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
     const int out_end_y   = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
     const int out_step_y  = _info.stride().second;
 
-    switch(_data_layout)
+    switch (_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -136,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
                 add_3D_tensor_argument(idx, _input, slice_in);
                 add_3D_tensor_argument(idx, _output, slice_out);
                 enqueue(queue, *this, slice_out, lws_hint());
-            }
-            while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+            } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
             break;
         }
         case DataLayout::NHWC:
@@ -155,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
                 add_3D_tensor_argument(idx, _input, slice_in);
                 add_3D_tensor_argument(idx, _output, slice_out);
                 enqueue(queue, *this, slice_out, lws_hint());
-            }
-            while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+            } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
             break;
         }
         default:
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index e0d1322341..762989a836 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -62,7 +62,10 @@ public:
      * @param[out] output          Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  info            Contains padding and stride information described in @ref PadStrideInfo.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const PadStrideInfo    &info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
      *
      * @param[in] input  Source tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index ca7e9d4b23..b33e0a8b6f 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,9 +27,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -38,7 +39,11 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status validate_arguments(const ITensorInfo   *input,
+                          const ITensorInfo   *bias,
+                          const ITensorInfo   *output,
+                          const ITensorInfo   *input_info,
+                          const ITensorInfo   *weights_info,
                           const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
@@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
     ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
     ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32);
-    if(!is_qasymm)
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S32);
+    if (!is_qasymm)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) *
+                                                           weights_info->dimension(idx_b));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
-        if(is_qasymm)
+        if (is_qasymm)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
         ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
     }
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
-        auto                out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+        auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+                                                        weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+                                                        stride_info);
 
-        const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+        const TensorShape output_shape =
+            misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
     }
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo   *input,
+                                                        ITensorInfo         *output,
+                                                        const ITensorInfo   *input_info,
+                                                        const ITensorInfo   *weights_info,
+                                                        const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -97,11 +111,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
     const size_t        idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
 
-    auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+    auto out_dims =
+        deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+                                        weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
 
-    const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+    const TensorShape output_shape =
+        misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
 
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+    auto_init_if_empty(*output, input->clone()
+                                    ->set_tensor_shape(output_shape)
+                                    .set_data_layout(data_layout)
+                                    .set_quantization_info(input->quantization_info()));
 
     Window win = calculate_max_window(*input);
 
@@ -109,28 +129,37 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
 }
 } // namespace
 
-CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
-    : _add_bias(false),
-      _bias(nullptr)
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor     *input,
+                                                   const ICLTensor     *bias,
+                                                   ICLTensor           *output,
+                                                   const ITensorInfo   *input_info,
+                                                   const ITensorInfo   *weights_info,
                                                    const PadStrideInfo &deconv_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info);
 }
 
-void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info,
-                                                   const ITensorInfo   *weights_info,
-                                                   const PadStrideInfo &deconv_info)
+void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context,
+                                                   const ICLTensor        *input,
+                                                   const ICLTensor        *bias,
+                                                   ICLTensor              *output,
+                                                   const ITensorInfo      *input_info,
+                                                   const ITensorInfo      *weights_info,
+                                                   const PadStrideInfo    &deconv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr),
+                                                  output->info(), input_info, weights_info, deconv_info));
 
-    auto padding_info = get_padding_info({ input, bias, output });
+    auto padding_info = get_padding_info({input, bias, output});
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+    auto win_config =
+        validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     const DataLayout data_layout = input_info->data_layout();
@@ -177,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo   *input,
+                                                    const ITensorInfo   *bias,
+                                                    const ITensorInfo   *output,
+                                                    const ITensorInfo   *input_info,
+                                                    const ITensorInfo   *weights_info,
                                                     const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
@@ -193,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu
     unsigned int idx = 0;
     add_3D_tensor_argument(idx, _input, collapsed);
     add_3D_tensor_argument(idx, _output, collapsed);
-    if(_add_bias)
+    if (_add_bias)
     {
         add_1D_tensor_argument(idx, _bias, collapsed);
     }
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
index ce354fa86f..8f436b07e3 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -67,7 +67,12 @@ public:
      * @param[in]  weights_info Deconvolution weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
      * @param[in]  deconv_info  Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
      */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+    void configure(const ICLTensor     *input,
+                   const ICLTensor     *bias,
+                   ICLTensor           *output,
+                   const ITensorInfo   *input_info,
+                   const ITensorInfo   *weights_info,
+                   const PadStrideInfo &deconv_info);
     /** Initialise the kernel's source and destination.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -79,8 +84,13 @@ public:
      * @param[in]  weights_info    Deconvolution weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
      * @param[in]  deconv_info     Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
-                   const PadStrideInfo &deconv_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *bias,
+                   ICLTensor              *output,
+                   const ITensorInfo      *input_info,
+                   const ITensorInfo      *weights_info,
+                   const PadStrideInfo    &deconv_info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref  CLDeconvolutionReshapeOutputKernel.
      *
@@ -93,7 +103,12 @@ public:
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *bias,
+                           const ITensorInfo   *output,
+                           const ITensorInfo   *input_info,
+                           const ITensorInfo   *weights_info,
+                           const PadStrideInfo &deconv_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
deleted file mode 100644
index c98d66f390..0000000000
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON(input == output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
-                                                         1,
-                                                         DataType::U8, DataType::S8, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
-                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
-                                                         1,
-                                                         DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
-                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(), "Input and output data types must be different");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_float(input->data_type()) && shift != 0, "Shift is used only with integer non-quantized inputs");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && shift != 0, "Shift is used only with integer non-quantized inputs");
-    ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
-
-    // Validate in case of configured output
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-} // namespace
-
-void CLDepthConvertLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
-}
-
-void CLDepthConvertLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    _input  = input;
-    _output = output;
-
-    // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
-
-    auto padding_info = get_padding_info({ input, output });
-
-    // Get data sizes
-    const size_t input_size  = data_size_from_type(input->info()->data_type());
-    const size_t output_size = data_size_from_type(output->info()->data_type());
-
-    // Get number of elements to process per iterations
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-    // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
-    build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
-    build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || is_data_type_float(output->info()->data_type()), "-DIS_DATA_TYPE_FLOAT");
-    build_opts.add_option_if(is_data_type_quantized(input->info()->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
-
-    // Create kernel
-    const std::string kernel_name = (input_size >= output_size) ? "convert_depth_down" : "convert_depth_up";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set shift arg
-    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
-    _kernel.setArg(idx++, shift);
-
-    // Since we have a leftover vector size calculated using the input tensor shape, it is required to
-    // have the input region equal to the tensor shape
-    ValidRegion input_valid_region = input->info()->valid_region();
-    input->info()->set_valid_region(ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
-
-    // Configure kernel
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    // Collapse window
-    const Window &full_window      = window();
-    Window        collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
-    ICLKernel::configure_internal(collapsed_window);
-
-    // Restore the valid region
-    input->info()->set_valid_region(input_valid_region);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
-
-    return Status{};
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.h b/src/core/CL/kernels/CLDepthConvertLayerKernel.h
deleted file mode 100644
index 8b511c6707..0000000000
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
-#define ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple3DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the depth conversion kernel. */
-class CLDepthConvertLayerKernel : public ICLSimple3DKernel
-{
-public:
-    /** Set the input and output of the kernel.
-     *
-     * Valid conversions Input -> Output :
-     *
-     *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
-     *   - U8  -> S8, U16, S16, U32, S32, F16, F32
-     *   - U16 -> U8, S8, S16, U32, S32, F16, F32
-     *   - S16 -> U8, S8, U16, U32, S32, F16, F32
-     *   - U32 -> U8, S8, U16, S16, S32, F16, F32
-     *   - S32 -> U8, S8, U16, S16, U32, F16, F32
-     *   - F16 -> U8, S8, U16, S16, U32, F32
-     *   - F32 -> U8, S8, U16, S16, U32, F16
-     *
-     * @param[in]  input  The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
-     * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  policy Conversion policy
-     * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
-    /** Set the input and output of the kernel.
-     *
-     * Valid conversions Input -> Output :
-     *
-     *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
-     *   - U8  -> S8, U16, S16, U32, S32, F16, F32
-     *   - U16 -> U8, S8, S16, U32, S32, F16, F32
-     *   - S16 -> U8, S8, U16, U32, S32, F16, F32
-     *   - U32 -> U8, S8, U16, S16, S32, F16, F32
-     *   - S32 -> U8, S8, U16, S16, U32, F16, F32
-     *   - F16 -> U8, S8, U16, S16, U32, F32
-     *   - F32 -> U8, S8, U16, S16, U32, F16
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
-     * @param[out] output          The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  policy          Conversion policy
-     * @param[in]  shift           Value for down/up conversions. Must be 0 <= shift < 8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayerKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
-     * @param[in] output Destination tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in] policy Conversion policy
-     * @param[in] shift  Value for down/up conversions. Must be 0 <= shift < 8.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H */
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index 8946f2a713..cdf19ab2e1 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,8 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -48,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
         const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+                                    (block_shape * input->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+                                    (block_shape * input->tensor_shape()[idx_height]));
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -62,9 +66,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 }
 } // namespace
 
-CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel()
-    : _input(nullptr), _output(nullptr), _block_shape()
+CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
@@ -72,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out
     configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
 }
 
-void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          ICLTensor              *output,
+                                          int32_t                 block_shape)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+    TensorShape output_shape =
+        compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
 
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
 
@@ -96,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
     build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
     build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
     build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
-    _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(compile_context,
+                            "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                            build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
@@ -135,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
         enqueue(queue, *this, slice_in, lws_hint());
 
         ++batch_id;
-    }
-    while(window.slide_window_slice_3D(slice_in));
+    } while (window.slide_window_slice_3D(slice_in));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
index 1f7f77b569..cef70c4dda 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -61,7 +62,8 @@ public:
      * @param[out] output          Tensor output. Data types supported: same as @p input
      * @param[in]  block_shape     Block shape value.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+    void
+    configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
      *
      * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
deleted file mode 100644
index ba7a782bf1..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ /dev/null
@@ -1,434 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                          const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D dilation,
-                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
-                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
-                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                    "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
-
-    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-
-    const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
-
-    if(biases != nullptr)
-    {
-        if(is_qasymm)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON((biases->dimension(0) != weights->dimension(2)) && (weights->dimension(2) != 1 || biases->dimension(0) != weights->dimension(3)));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    if(is_qasymm)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-
-        if(is_data_type_quantized_per_channel(weights->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
-            ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_multipliers->dimension(0));
-            ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_shifts->dimension(0));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-            ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0));
-            ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0));
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    }
-
-    if(output->total_size() != 0)
-    {
-        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                        unsigned int depth_multiplier, GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation)
-{
-    // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
-
-    const unsigned int conv_stride_x = conv_info.stride().first;
-    const unsigned int conv_stride_y = conv_info.stride().second;
-    const bool         is_qasymm     = is_data_type_quantized_asymmetric(input->data_type());
-    const bool         is_bifrost    = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
-
-    // Configure kernel window
-    unsigned int num_elems_read_per_iteration_x    = 0;
-    unsigned int num_elems_read_per_iteration_y    = 0;
-    unsigned int num_elems_written_per_iteration_x = 0;
-    unsigned int num_elems_written_per_iteration_y = 0;
-
-    if(input->data_type() == DataType::F16)
-    {
-        kernel_name                       = "depthwise_convolution_3x3_f16";
-        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
-        num_elems_written_per_iteration_y = 1;
-        num_elems_read_per_iteration_y    = 3;
-        switch(conv_stride_x)
-        {
-            case 1:
-                num_elems_read_per_iteration_x = 8;
-                break;
-            case 2:
-                num_elems_read_per_iteration_x = 9;
-                break;
-            case 3:
-                num_elems_read_per_iteration_x = 16;
-                break;
-            default:
-                num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
-                break;
-        }
-        if(is_bifrost)
-        {
-            if(conv_stride_x == 1 && conv_stride_y == 1)
-            {
-                kernel_name                       = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16";
-                num_elems_read_per_iteration_x    = 8;
-                num_elems_written_per_iteration_x = 4;
-                num_elems_read_per_iteration_y    = 6;
-                num_elems_written_per_iteration_y = 4;
-            }
-            else if(conv_stride_x == 2 && conv_stride_y == 2)
-            {
-                kernel_name                       = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16";
-                num_elems_read_per_iteration_x    = 10;
-                num_elems_written_per_iteration_x = 4;
-                num_elems_read_per_iteration_y    = 5;
-                num_elems_written_per_iteration_y = 2;
-            }
-        }
-    }
-    else if(input->data_type() == DataType::F32 && is_bifrost)
-    {
-        if(conv_stride_x == 1 && conv_stride_y == 1)
-        {
-            kernel_name                       = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32";
-            num_elems_read_per_iteration_x    = 4;
-            num_elems_read_per_iteration_y    = 6;
-            num_elems_written_per_iteration_x = 2;
-            num_elems_written_per_iteration_y = 4;
-        }
-        else if(conv_stride_x == 2 && conv_stride_y == 2)
-        {
-            kernel_name                       = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32";
-            num_elems_read_per_iteration_x    = 6;
-            num_elems_read_per_iteration_y    = 5;
-            num_elems_written_per_iteration_x = 2;
-            num_elems_written_per_iteration_y = 2;
-        }
-        else
-        {
-            kernel_name                       = "depthwise_convolution_3x3";
-            num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
-            num_elems_written_per_iteration_y = 1;
-            num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
-            num_elems_read_per_iteration_y    = 3;
-        }
-    }
-    else
-    {
-        const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_data_type_quantized_per_channel(weights->data_type());
-
-        kernel_name = is_qasymm ? "dwc_3x3_native_quantized8" : "depthwise_convolution_3x3";
-        kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : "");
-        kernel_name += (is_qasymm ? "_nchw" : "");
-
-        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
-        num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1 && dilation.y() == 1) ? 2 : 1;
-        num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x + (conv_stride_x > 1 ? 1 : 0);
-        num_elems_read_per_iteration_y    = num_elems_written_per_iteration_y + 2;
-    }
-    // The OpenCL routine convolution1x3 does loadn(addr), loadn(addr + dilation_x) and loadn(addr + 2 * dilation_x) on the input.
-    // Each of the three convolution1x3 gets called by passing addr, (addr + dilation_y) and (addr + 2 * dilation_y)
-    // Hence we must add 2 * dilation.x/y() to the number of elements read in those axes per thread
-    num_elems_read_per_iteration_x += 2 * dilation.x();
-    num_elems_read_per_iteration_y += 2 * dilation.y();
-
-    // Create window and update padding
-    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
-    AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
-                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
-                                       conv_stride_x, conv_stride_y);
-    AccessWindowStatic    weights_access(weights, 0, 0, 3, 3);
-    AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
-    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayer3x3NCHWKernel::CLDepthwiseConvolutionLayer3x3NCHWKernel()
-    : _conv_stride_x(0), _conv_pad_top(0), _conv_pad_left(0)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3NCHWKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                         const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
-                                                         const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts);
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                         const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
-                                                         const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
-                                                  conv_info, depth_multiplier, act_info, dilation,
-                                                  (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
-                                                  (output_shifts != nullptr) ? output_shifts->info() : nullptr));
-
-    _input              = input;
-    _output             = output;
-    _weights            = weights;
-    _biases             = biases;
-    _conv_stride_x      = conv_info.stride().first;
-    _conv_stride_y      = conv_info.stride().second;
-    _conv_pad_left      = conv_info.pad_left();
-    _conv_pad_top       = conv_info.pad_top();
-    _output_multipliers = output_multipliers;
-    _output_shifts      = output_shifts;
-    _is_quantized       = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-    // Configure kernel window
-    std::string     kernel_name;
-    const GPUTarget gpu_target = get_target();
-
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    _border_size = BorderSize(input->info()->padding());
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
-    build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->tensor_shape().z()));
-    build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
-    build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
-    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
-    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
-    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-
-    if(_is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
-
-        const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
-        const bool is_dot8_supported        = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
-        build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
-        build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
-        build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
-        build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
-        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset));
-        build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-        build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
-
-        // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
-        float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
-        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
-        if(act_info.enabled())
-        {
-            int a_val{};
-            int b_val{};
-            std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, input->info()->data_type(), oq_info);
-
-            const int o1 = oq_info.offset;
-
-            build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
-            build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
-            build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
-            const float s1 = iq_info.scale;
-            build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
-            build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
-        }
-
-        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-        build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
-        build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type()));
-    }
-    else
-    {
-        build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
-        build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-        build_opts.add_option_if(act_info.enabled(), "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-        build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(win_config.second.x().step()));
-    }
-
-    build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
-    build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
-
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                          const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target,
-                                                          const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    std::string kernel_name;
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(),
-                                                              conv_info, depth_multiplier, gpu_target, kernel_name, dilation)
-                                .first);
-
-    return Status{};
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
-    // Create input window and adjust
-    Window collapsed_in = collapsed;
-    collapsed_in.adjust(Window::DimX, -_conv_pad_left, true);
-    collapsed_in.adjust(Window::DimY, -_conv_pad_top, true);
-    collapsed_in.set_dimension_step(Window::DimX, collapsed_in.x().step() * _conv_stride_x);
-    collapsed_in.set_dimension_step(Window::DimY, collapsed_in.y().step() * _conv_stride_y);
-
-    Window slice_in      = collapsed_in.first_slice_window_3D();
-    Window slice_out     = collapsed.first_slice_window_3D();
-    Window slice_weights = window.first_slice_window_3D();
-    slice_weights.set_dimension_step(Window::DimX, 0);
-    slice_weights.set_dimension_step(Window::DimY, 0);
-
-    unsigned int idx = 3 * num_arguments_per_3D_tensor();
-
-    // Set output multipliers in case of quantized data type
-    if(_is_quantized)
-    {
-        Window slice;
-        slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape());
-        add_1D_tensor_argument(idx, _output_multipliers, slice);
-        add_1D_tensor_argument(idx, _output_shifts, slice);
-    }
-
-    // Set biases
-    if(_biases != nullptr)
-    {
-        Window slice_biases;
-        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
-        add_1D_tensor_argument(idx, _biases, slice_biases);
-    }
-
-    do
-    {
-        idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_3D_tensor_argument(idx, _output, slice_out);
-        add_3D_tensor_argument(idx, _weights, slice_weights);
-
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice_out) && collapsed_in.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
deleted file mode 100644
index 45b5869676..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-
-#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NCHW.
- */
-class CLDepthwiseConvolutionLayer3x3NCHWKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
-{
-public:
-    /** Default constructor */
-    CLDepthwiseConvolutionLayer3x3NCHWKernel();
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NCHWKernel
-     *
-     * @param[in] input              Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [3, 3, IFM].
-     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in] conv_info          Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
-     * @param[in] gpu_target         (Optional) GPU target to validate the kernel for. Defaults to midgard.
-     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD,
-                           const Size2D &dilation = Size2D(1U, 1U), const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    unsigned int _conv_stride_x;
-    unsigned int _conv_pad_top;
-    unsigned int _conv_pad_left;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H */
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
deleted file mode 100644
index d13afd2010..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ /dev/null
@@ -1,464 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                          const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation,
-                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
-                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
-                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                    "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
-
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 4);
-
-    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-
-    const bool   is_qasymm      = is_data_type_quantized_asymmetric(input->data_type());
-    const size_t weights_width  = 3;
-    const size_t weights_height = 3;
-
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
-                                         *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
-    if(is_qasymm)
-    {
-        DepthwiseConvolutionReshapeInfo info;
-        info.c0 = 4;
-        ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(0) / info.c0) != weights_width * weights_height);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-
-        if(is_data_type_quantized_per_channel(weights->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_multipliers->dimension(0));
-            ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_shifts->dimension(0));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-            ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0));
-            ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0));
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-        ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(1) != weights_width) || (weights->dimension(2) != weights_height));
-    }
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[0]);
-        if(is_qasymm)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        }
-
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
-                                                        const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
-                                                        ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_UNUSED(weights);
-    ARM_COMPUTE_UNUSED(depth_multiplier);
-
-    const bool   is_stride_1_dilation_1           = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1) && dilation.x() == 1 && dilation.y() == 1);
-    unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
-
-    Window win{};
-    Status err{};
-
-    if(is_data_type_quantized_asymmetric(input->data_type()))
-    {
-        const unsigned int num_elems_accessed_per_iteration = 4;
-        const unsigned int num_rows_read_per_iteration      = num_rows_processed_per_iteration + 2;
-        const unsigned int num_rows_written_per_iteration   = std::ceil(num_rows_processed_per_iteration / static_cast<float>(conv_info.stride().first));
-
-        BorderSize border_size;
-        border_size = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
-
-        // Configure kernel window
-        win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_written_per_iteration));
-
-        AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration),
-                                        ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration));
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
-
-        bool window_changed = false;
-
-        if((output_multipliers != nullptr) && (output_shifts != nullptr))
-        {
-            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_accessed_per_iteration);
-            AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_accessed_per_iteration);
-            window_changed = window_changed || update_window_and_padding(win, input_access, output_access, output_multipliers_access, output_shifts_access);
-        }
-        else
-        {
-            Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input");
-            return std::make_pair(err, win);
-        }
-
-        if(bias != nullptr)
-        {
-            AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration);
-            window_changed = window_changed || update_window_and_padding(win, bias_access);
-        }
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-        err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    }
-    else
-    {
-        unsigned int num_elems_accessed_per_iteration = adjust_vec_size(4 / input->element_size(), input->dimension(0));
-        win                                           = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_processed_per_iteration));
-    }
-
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayer3x3NHWCKernel::CLDepthwiseConvolutionLayer3x3NHWCKernel()
-    : _num_planes_processed_per_iteration(1)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3NHWCKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                         const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
-                                                         const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts);
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                         const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
-                                                         const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
-                                                  conv_info, depth_multiplier, act_info, dilation,
-                                                  (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
-                                                  (output_shifts != nullptr) ? output_shifts->info() : nullptr));
-
-    auto padding_info = get_padding_info({ input, weights, biases, output });
-
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
-                                                    conv_info, depth_multiplier, dilation,
-                                                    (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
-                                                    (output_shifts != nullptr) ? output_shifts->info() : nullptr);
-
-    const bool is_stride_1              = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool is_stride_1_dilation_1   = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
-    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
-    const bool is_dot8_supported        = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
-
-    _input                              = input;
-    _output                             = output;
-    _weights                            = weights;
-    _biases                             = biases;
-    _conv_stride_y                      = conv_info.stride().second;
-    _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
-    _output_multipliers                 = output_multipliers;
-    _output_shifts                      = output_shifts;
-    _is_quantized                       = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-    if(_is_quantized)
-    {
-        _border_size = BorderSize(input->info()->padding());
-
-        // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
-        if(is_dot8_supported)
-        {
-            _num_planes_processed_per_iteration = 1;
-        }
-    }
-
-    unsigned int num_elems_accessed_per_iteration = _is_quantized ? 4 : adjust_vec_size(4 / input->info()->element_size(), input->info()->dimension(0));
-    unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
-
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
-    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
-    build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
-    build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
-    build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
-    build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_accessed_per_iteration));
-    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-    build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
-                             "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
-
-    if(_is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
-
-        build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
-        build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
-        build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
-        build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
-        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset));
-        build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-        build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
-
-        // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
-        float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
-        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
-        if(act_info.enabled())
-        {
-            int a_val{};
-            int b_val{};
-            std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, input->info()->data_type(), oq_info);
-
-            const int o1 = oq_info.offset;
-
-            build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
-            build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
-            build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
-            const float s1 = iq_info.scale;
-            build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
-            build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
-        }
-
-        build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
-        build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type()));
-    }
-    else
-    {
-        build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
-        build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-    }
-
-    if(is_stride_1_dilation_1)
-    {
-        build_opts.add_option("-DNUM_ROWS_PROCESSED=" + support::cpp11::to_string(num_rows_processed_per_iteration));
-        build_opts.add_option("-DNUM_PLANES_PROCESSED=" + support::cpp11::to_string(_num_planes_processed_per_iteration));
-        build_opts.add_option("-DDST_DIM_1=" + support::cpp11::to_string(_output->info()->dimension(1)));
-        build_opts.add_option("-DDST_DIM_2=" + support::cpp11::to_string(_output->info()->dimension(2)));
-        build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string((input->info()->dimension(1) + conv_info.pad_left() + conv_info.pad_right()) % num_rows_processed_per_iteration));
-    }
-    else
-    {
-        build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
-        build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
-        build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
-        build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
-    }
-
-    std::string kernel_name;
-    // Create kernel
-    if(_is_quantized)
-    {
-        kernel_name = std::string("dwc_3x3_reshaped_quantized8");
-        kernel_name += (is_dot8_supported && is_stride_1_dilation_1 ? "_dot8" : "");
-        kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
-        kernel_name += "_nhwc";
-    }
-    else
-    {
-        kernel_name = std::string("depthwise_convolution_3x3_nhwc");
-        kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
-    }
-
-    ICLKernel::configure_internal(win_config.second);
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    ARM_COMPUTE_ERROR_ON(!_is_quantized && has_padding_changed(padding_info));
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += string_from_data_type(input->info()->data_type());
-}
-
-Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                          const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
-                                                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
-                                                              biases != nullptr ? biases->clone().get() : nullptr,
-                                                              output->clone().get(), conv_info, depth_multiplier, dilation,
-                                                              (output_multipliers != nullptr) ? output_multipliers->clone().get() : nullptr,
-                                                              (output_shifts != nullptr) ? output_shifts->clone().get() : nullptr)
-                                .first);
-    return Status{};
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3);
-
-    Window win = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
-
-    unsigned int idx = 2 * num_arguments_per_4D_tensor() + (_is_quantized ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
-
-    if(_is_quantized)
-    {
-        Window slice;
-        slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape());
-        slice.set_dimension_step(Window::DimX, window.x().step());
-        add_1D_tensor_argument(idx, _output_multipliers, slice);
-        add_1D_tensor_argument(idx, _output_shifts, slice);
-    }
-
-    if(_biases != nullptr)
-    {
-        Window win_biases;
-        win_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
-        win_biases.set_dimension_step(Window::DimX, window.x().step());
-        add_1D_tensor_argument(idx, _biases, win_biases);
-    }
-
-    if(_is_quantized)
-    {
-        // Calculate the max_offset.
-        // max_offset is the offset for the last NOT valid value in the Z dimension (spatial dimension Y for NHWC)
-        //  |******************|
-        //  |     pad_top      |
-        //  |******************|
-        //  |                  |
-        //  |      plane0      |
-        //  |      batch0      |
-        //  |__________________|
-        //  |******************|       Batch 0
-        //  |    pad_bottom    |
-        //  |     pad_top      |
-        //  |******************|
-        //  |                  |
-        //  |      plane1      |
-        //  |      batch0      |
-        //  |__________________|-----> max_offset
-        //  |******************|
-        //  |    pad_bottom    |
-        //  |     pad_top      |
-        //  |******************|
-        //  |                  |
-        //  |      plane0      |
-        //  |      batch1      |
-        //  |__________________|
-        //  |******************|       Batch 1
-        //  |    pad_bottom    |
-        //  |     pad_top      |
-        //  |******************|
-        //  |                  |
-        //  |      plane1      |
-        //  |      batch1      |
-        //  |__________________|
-        //  |     pad_bottom   |
-        //  |******************|
-        const int max_offset = ((_input->info()->dimension(1) * _input->info()->dimension(2)) + (_input->info()->padding().bottom + _input->info()->padding().top) * (_input->info()->dimension(
-                                    2) - 1)) * _input->info()->strides_in_bytes().y();
-        _kernel.setArg(idx, max_offset);
-    }
-
-    Window slice = win.first_slice_window_4D();
-    do
-    {
-        unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input, slice);
-        add_4D_tensor_argument(idx, _output, slice);
-        if(_is_quantized)
-        {
-            add_2D_tensor_argument(idx, _weights, slice);
-        }
-        else
-        {
-            add_3D_tensor_argument(idx, _weights, slice);
-        }
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(win.slide_window_slice_4D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
deleted file mode 100644
index ce0bf5ceb3..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-
-#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NHWC.
- */
-class CLDepthwiseConvolutionLayer3x3NHWCKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
-{
-public:
-    /** Default constructor */
-    CLDepthwiseConvolutionLayer3x3NHWCKernel();
-    /** Default move assignment operator. */
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
-     *
-     * @param[in] input              Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3].
-     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output             Destination tensor info. Data type supported: Same as @p input.
-     * @param[in] conv_info          Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
-     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    unsigned int _num_planes_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H */
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index c34018a000..b95abe795f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,46 +28,94 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                          const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
-                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo          *input,
+                          const ITensorInfo          *weights,
+                          const ITensorInfo          *biases,
+                          const ITensorInfo          *output,
+                          const DWCComputeKernelInfo &dwc_info,
+                          const ConvolutionInfo      &conv_info,
+                          const ITensorInfo          *output_multipliers,
+                          const ITensorInfo          *output_shifts)
 {
     ARM_COMPUTE_UNUSED(dwc_info);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    bool in_place = false;
+    if (output == nullptr || output == input)
+    {
+        in_place = true;
+        output   = input;
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1 && dwc_weights_info.n0 != 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().second < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) &&
+                                        (export_to_cl_image(weights) == false),
+                                    "Weights cannot be exported to cl_image!");
+    ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1));
     const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_UNUSED(idx_c);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier));
+
+    // In place restrictions
+    if (in_place)
+    {
+        const int weights_width_idx =
+            get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+        const int weights_height_idx =
+            get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U ||
+                                    weights->tensor_shape()[weights_height_idx] != 1U);
+        ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U);
+        ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U));
+        ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U));
+        ARM_COMPUTE_RETURN_ERROR_ON(
+            conv_info.pad_stride_info
+                .has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it
+    }
 
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+    const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(),
+                               conv_info.dilation};
+    const TensorShape     output_shape =
+        arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+
+    if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
+    }
 
     const bool is_quantized = is_data_type_quantized(input->data_type());
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
 
-        if(is_quantized)
+        if (is_quantized)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -77,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
         }
     }
 
-    if(is_quantized)
+    if (is_quantized)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
@@ -85,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
         ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
 
-        if(is_data_type_quantized_per_channel(weights->data_type()))
+        if (is_data_type_quantized_per_channel(weights->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
             ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0));
@@ -103,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     }
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
-    if(is_data_type_quantized(input->data_type()))
+    if (is_data_type_quantized(input->data_type()))
     {
         const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
         const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+        const UniformQuantizationInfo oq_info =
+            (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
 
         float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
         int   output_multiplier = 0;
         int   output_shift      = 0;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
     }
 
     return Status{};
@@ -133,110 +183,194 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
       _depth_multiplier(1),
       _output_multipliers(nullptr),
       _output_shifts(nullptr),
+      _export_input_to_cl_image(false),
+      _export_weights_to_cl_image(false),
       _is_quantized(false)
 {
+    _type = CLKernelType::DEPTHWISE;
 }
 
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                                                        const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
-                                                        const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor                  *input,
+                                                        const ICLTensor            *weights,
+                                                        const ICLTensor            *biases,
+                                                        ICLTensor                  *output,
+                                                        const DWCComputeKernelInfo &dwc_info,
+                                                        const ConvolutionInfo      &conv_info,
+                                                        const ICLTensor            *output_multipliers,
+                                                        const ICLTensor            *output_shifts)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info,
+              output_multipliers, output_shifts);
 }
 
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                        const DWCWeightsKernelInfo &dwc_weights_info,
-                                                        const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
-                                                        const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext     &compile_context,
+                                                        ICLTensor                  *input,
+                                                        const ICLTensor            *weights,
+                                                        const ICLTensor            *biases,
+                                                        ICLTensor                  *output,
+                                                        const DWCComputeKernelInfo &dwc_info,
+                                                        const ConvolutionInfo      &conv_info,
+                                                        const ICLTensor            *output_multipliers,
+                                                        const ICLTensor            *output_shifts)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
-                                                  dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
-                                                  (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+    if (output == nullptr)
+    {
+        // In-place
+        output = input;
+    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+        input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info,
+        conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
+        (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+
+    auto padding_info = get_padding_info({input, output});
+
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+        *(input->info()), *(weights->info()), conv_info);
+    auto_init_if_empty(*(output->info()), input->info()
+                                              ->clone()
+                                              ->set_tensor_shape(output_shape)
+                                              .set_quantization_info(output->info()->quantization_info()));
+
+    _input                      = input;
+    _output                     = output;
+    _weights                    = weights;
+    _biases                     = biases;
+    _depth_multiplier           = conv_info.depth_multiplier;
+    _output_multipliers         = output_multipliers;
+    _output_shifts              = output_shifts;
+    _export_input_to_cl_image   = dwc_info.export_input_to_cl_image;
+    _export_weights_to_cl_image = dwc_info.export_weights_to_cl_image;
+    _is_quantized               = is_data_type_quantized(input->info()->data_type());
+
+    const unsigned int n0          = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
+    const unsigned int m0          = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
+    std::string        kernel_name = "";
 
-    auto padding_info = get_padding_info({ input, output });
+    CLBuildOptions build_opts;
 
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info, depth_multiplier, dilation);
-    auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+    // Update the padding for the input/weights tensor if we can export to cl_image
+    if (_export_input_to_cl_image)
+    {
+        arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
+    }
 
-    _input              = input;
-    _output             = output;
-    _weights            = weights;
-    _biases             = biases;
-    _depth_multiplier   = depth_multiplier;
-    _output_multipliers = output_multipliers;
-    _output_shifts      = output_shifts;
-    _is_quantized       = is_data_type_quantized(input->info()->data_type());
+    if (_export_weights_to_cl_image)
+    {
+        arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
+    }
 
-    const unsigned int n0 = adjust_vec_size(dwc_weights_info.n0, input->info()->dimension(0));
+    // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
+    const GPUTarget gpu_target    = get_target();
+    const auto      act_function  = conv_info.act_info.activation();
+    const auto      dst_data_type = _output->info()->data_type();
 
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-    build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1, "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(_output->info()->dimension(2))));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
-    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(dwc_info.activation_info.activation())));
-    build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
+    if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+        (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+         act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+        (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+    {
+        // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+        // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+        build_opts.add_option("-cl-unsafe-math-optimizations");
+    }
+    else
+    {
+        build_opts.add_option("-cl-fast-relaxed-math");
+    }
+
+    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
+    build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(conv_info.depth_multiplier));
+    build_opts.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
+    // Note: SRC_DATA_TYPE must have the same data type of WEI_DATA_TYPE. In quantized, we could
+    // have a case where the data types for the activation and weights are different. However, since the implementation
+    // only works when both have same data type, we have to change the offset to take into account this aspect
+    build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+    build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
+    build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
+    build_opts.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(1)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(2)));
+    build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(1)));
+    build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(2)));
+    build_opts.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(1)));
+    build_opts.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(2)));
+    build_opts.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
+    build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_stride_info.pad_top()));
+    build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_stride_info.pad_left()));
+    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.pad_stride_info.stride().first));
+    build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.pad_stride_info.stride().second));
+    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(conv_info.dilation.x()));
+    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(conv_info.dilation.y()));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DSRC_DIM1=" + support::cpp11::to_string(_input->info()->dimension(1)));
-    build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(_input->info()->dimension(2)));
-    build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights->info()->dimension(1)));
-    build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights->info()->dimension(2)));
-    build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
-    build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
-    build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
-    build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
-    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
-    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
-
-    std::string kernel_name = (_is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc";
-
-    if(_is_quantized)
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
+    build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1",
+                                  "-DN0_A=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
+    build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+
+    // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll
+    set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+                                        static_cast<int>(_weights->info()->dimension(1)),
+                                        static_cast<int>(_weights->info()->dimension(2))});
+
+    if (biases != nullptr)
     {
-        const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
+        build_opts.add_option(std::string("-DHAS_BIAS"));
+        build_opts.add_option(
+            std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
+    }
 
-        build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
-        build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
-        build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
-        build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION");
+    if (_is_quantized)
+    {
+        kernel_name                          = "dwc_native_quantized_nhwc";
+        const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
 
-        // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
-        float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
+        PixelValue zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+        int        zero_value_s32;
+        zero_value.get(zero_value_s32);
+
+        float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
         int   output_multiplier = 0;
         int   output_shift      = 0;
         quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
-        if(dwc_info.activation_info.enabled())
-        {
-            int a_val{};
-            int b_val{};
-            std::tie(b_val, a_val) = get_quantized_activation_min_max(dwc_info.activation_info, input->info()->data_type(), oq_info);
-
-            const int o1 = oq_info.offset;
-
-            build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
-            build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
-            build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
-            const float s1 = iq_info.scale;
-            build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
-            build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
-        }
-
-        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-        build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
+        build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+        build_opts.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+        build_opts.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+        build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+        build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+        build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+        build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" +
+                              get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
+        build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" +
+                              get_cl_type_from_data_type(_output_shifts->info()->data_type()));
+        build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL,
+                                      "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
+        // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
+        int a_val{};
+        int b_val{};
+        std::tie(b_val, a_val) =
+            get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
+
+        build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val));
+        build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val));
     }
     else
     {
-        build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.a()));
-        build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.b()));
+        kernel_name = "dwc_native_fp_nhwc";
+        build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+        build_opts.add_option_if(conv_info.act_info.enabled(),
+                                 "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
+        build_opts.add_option_if(conv_info.act_info.enabled(),
+                                 "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
     }
 
-    Window win = calculate_max_window(*(output->info()), Steps(n0));
+    Window win = calculate_max_window(*(output->info()), Steps(n0, m0));
     ICLKernel::configure_internal(win);
 
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -261,11 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
     _config_id += string_from_data_type(input->info()->data_type());
 }
 
-Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                         const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info,
-                                                         unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo          *input,
+                                                         const ITensorInfo          *weights,
+                                                         const ITensorInfo          *biases,
+                                                         const ITensorInfo          *output,
+                                                         const DWCComputeKernelInfo &dwc_info,
+                                                         const ConvolutionInfo      &conv_info,
+                                                         const ITensorInfo          *output_multipliers,
+                                                         const ITensorInfo          *output_shifts)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
     return Status{};
 }
 
@@ -276,37 +416,61 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
 
     // Collapse window
     Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
-    Window slice_in         = window.first_slice_window_4D();
-    Window slice_out        = window_collapsed.first_slice_window_4D();
 
-    if(_depth_multiplier != 1)
-    {
-        ARM_COMPUTE_ERROR_ON(slice_out.x().step() != 1);
-        slice_out.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1));
-    }
+    Window slice = window_collapsed.first_slice_window_4D();
 
-    unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
+    cl::Image2D input_cl_image;
+    cl::Image2D weights_cl_image;
 
-    // Set output multipliers in case of quantized data type
-    if(_is_quantized)
+    if (_export_input_to_cl_image || _export_weights_to_cl_image)
     {
-        add_1D_tensor_argument(idx, _output_multipliers, slice_in);
-        add_1D_tensor_argument(idx, _output_shifts, slice_in);
+        // Export cl_buffer to cl_image
+        if (_export_input_to_cl_image)
+        {
+            const size_t image_w = _input->info()->dimension(0) / 4;
+            const size_t image_h =
+                _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+            const TensorShape shape2d(image_w, image_h);
+            const size_t      image_row_pitch = _input->info()->strides_in_bytes()[1];
+            input_cl_image =
+                create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d,
+                                           _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+        }
+
+        if (_export_weights_to_cl_image)
+        {
+            const size_t image_w = _weights->info()->dimension(0) / 4;
+            const size_t image_h =
+                _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+            const TensorShape shape2d(image_w, image_h);
+            const size_t      image_row_pitch = _weights->info()->strides_in_bytes()[1];
+            weights_cl_image =
+                create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d,
+                                           _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+        }
     }
 
-    if(_biases != nullptr)
+    unsigned int idx = 0;
+    if (_export_input_to_cl_image)
     {
-        add_1D_tensor_argument(idx, _biases, slice_in);
+        _kernel.setArg(idx++, input_cl_image);
     }
-
-    do
+    add_4d_tensor_nhwc_argument(idx, _input);
+    add_4d_tensor_nhwc_argument(idx, _output);
+    if (_export_weights_to_cl_image)
+    {
+        _kernel.setArg(idx++, weights_cl_image);
+    }
+    add_4d_tensor_nhwc_argument(idx, _weights);
+    if (_is_quantized)
+    {
+        add_1D_tensor_argument(idx, _output_multipliers, slice);
+        add_1D_tensor_argument(idx, _output_shifts, slice);
+    }
+    if (_biases != nullptr)
     {
-        idx = 0;
-        add_4D_tensor_argument(idx, _input, slice_in);
-        add_4D_tensor_argument(idx, _output, slice_out);
-        add_3D_tensor_argument(idx, _weights, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
+        add_1D_tensor_argument(idx, _biases, slice);
     }
-    while(window_collapsed.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+    enqueue(queue, *this, slice, lws_hint());
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 325f4e7067..d34a662966 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,10 @@
 #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
 #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+
+#include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
 {
@@ -47,85 +48,84 @@ public:
     CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
     /** Allow instances of this class to be moved */
     CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+
     /** Initialize the function's source, destination and parameters
      *
+     * @param[in]  compile_context    The compile context to be used.
      * @param[in]  input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
      * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
      *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
      *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
+     * @param[out] output             Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
      * @param[in]  dwc_info           Depthwise convolution layer info
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  conv_info          Convolution info (padding, stride, dilation, ...)
      * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
      *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
      * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
      *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     *
+     * @note: In-place is only supported when
+     *          * data layout: NHWC
+     *          * filter: 1x1
+     *          * @p depth_multiplier: 1
+     *          * strides: 1
+     *          * dilation: 1
+     *          * no padding
+     *          * no change of data layout after configure
      */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Initialize the function's source, destination and parameters
+    void configure(const CLCompileContext     &compile_context,
+                   ICLTensor                  *input,
+                   const ICLTensor            *weights,
+                   const ICLTensor            *biases,
+                   ICLTensor                  *output,
+                   const DWCComputeKernelInfo &dwc_info,
+                   const ConvolutionInfo      &conv_info,
+                   const ICLTensor            *output_multipliers = nullptr,
+                   const ICLTensor            *output_shifts      = nullptr);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
      *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
-     * @param[in]  dwc_info           Depthwise convolution layer info
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    void configure(ICLTensor                  *input,
+                   const ICLTensor            *weights,
+                   const ICLTensor            *biases,
+                   ICLTensor                  *output,
+                   const DWCComputeKernelInfo &dwc_info,
+                   const ConvolutionInfo      &conv_info,
+                   const ICLTensor            *output_multipliers = nullptr,
+                   const ICLTensor            *output_shifts      = nullptr);
+
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
      *
-     * @param[in] input              Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
-     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [IFM, N, M].
-     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
-     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output             Destination tensor info. Data type supported: Same as @p input.
-     * @param[in] dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
-     * @param[in] dwc_info           Depthwise convolution layer info
-     * @param[in] conv_info          Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                           const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+    static Status validate(const ITensorInfo          *input,
+                           const ITensorInfo          *weights,
+                           const ITensorInfo          *biases,
+                           const ITensorInfo          *output,
+                           const DWCComputeKernelInfo &dwc_info,
+                           const ConvolutionInfo      &conv_info,
+                           const ITensorInfo          *output_multipliers = nullptr,
+                           const ITensorInfo          *output_shifts      = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    const ICLTensor *_input;
-    const ICLTensor *_weights;
-    const ICLTensor *_biases;
-    ICLTensor       *_output;
-    unsigned int     _depth_multiplier;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _is_quantized;
+    const ICLTensor *_input{};
+    const ICLTensor *_weights{};
+    const ICLTensor *_biases{};
+    ICLTensor       *_output{};
+    unsigned int     _depth_multiplier{0};
+    const ICLTensor *_output_multipliers{};
+    const ICLTensor *_output_shifts{};
+    bool             _export_input_to_cl_image{false};
+    bool             _export_weights_to_cl_image{true};
+    bool             _is_quantized{false};
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
deleted file mode 100644
index b10c23bde9..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.c0 != 4);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_h) != 3);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != 3);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
-    if(output->total_size() != 0)
-    {
-        auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), reshaped_weights_shape);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
-    auto reshaped_input_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
-    auto_init_if_empty(*output, reshaped_input_shape, 1, input->data_type(), input->quantization_info());
-
-    Window                 win = calculate_max_window(*input, Steps(info.c0));
-    AccessWindowHorizontal weights_access(input, 0, info.c0);
-    const bool             window_changed = update_window_and_padding(win, weights_access);
-
-    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayerReshapeWeightsKernel::CLDepthwiseConvolutionLayerReshapeWeightsKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
-    auto win_config = validate_and_configure_window(input->info(), output->info(), info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-    ICLKernel::configure_internal(win_config.second);
-
-    _input  = input;
-    _output = output;
-
-    // Build the kernel
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(info.c0));
-    build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(0)));
-    build_opts.add_option_if(info.transpose, "-DTRANSPOSE");
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-
-    _kernel = create_kernel(compile_context, "depthwise_convolution_reshape_weights", build_opts.options());
-}
-
-Status CLDepthwiseConvolutionLayerReshapeWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info).first);
-    return Status{};
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, window);
-    add_2D_tensor_argument(idx, _output, window);
-    enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
deleted file mode 100644
index 650fe9a11b..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to reshape the weights of depthwise convolution. */
-class CLDepthwiseConvolutionLayerReshapeWeightsKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
-    /** Default Move Constructor. */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
-    /** Default move assignment operator */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
-
-    /** Initialize the function's source and destination.
-     *
-     * @param[in]  input  The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
-     * @param[out] output The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p weights.
-     * @param[in]  info   Depthwise convolution information to reshape the input tensor.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
-    /** Initialize the function's source and destination.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
-     * @param[out] output          The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p weights.
-     * @param[in]  info            Depthwise convolution information to reshape the input tensor.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
-     *
-     * @param[in] input  The input tensor info of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
-     * @param[in] output The output tensor info of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p weights.
-     * @param[in] info   Depthwise convolution information to reshape the input tensor.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-
-    void configure_dot_product(const DepthwiseConvolutionReshapeInfo &info);
-    void configure_generic(const DepthwiseConvolutionReshapeInfo &info);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H */
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
deleted file mode 100644
index 3723c651fe..0000000000
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDequantizationLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
-
-    if(output->tensor_shape().total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-} // namespace
-
-CLDequantizationLayerKernel::CLDequantizationLayerKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLDequantizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32);
-
-    auto padding_info = get_padding_info({ input, output });
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
-
-    const int  vec_size_x     = 16 / output->info()->element_size();
-    const int  output_width_x = output->info()->tensor_shape().x();
-    const bool multi_access_x = (output_width_x / vec_size_x > 0);
-
-    const bool  is_quantized_per_channel = is_data_type_quantized_per_channel(input->info()->data_type());
-    std::string kernel_name              = "dequantization_layer";
-
-    // Create kernel
-    CLBuildOptions build_opts;
-    if(!is_quantized_per_channel)
-    {
-        const UniformQuantizationInfo qinfo   = input->info()->quantization_info().uniform();
-        const int                     qoffset = is_data_type_quantized_asymmetric(input->info()->data_type()) ? qinfo.offset : 0;
-        build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
-        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
-    }
-    else
-    {
-        kernel_name += "_per_channel";
-        kernel_name += input->info()->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
-    }
-
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
-
-    // Create kernel name
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-    if(multi_access_x)
-    {
-        win.set(Window::DimX,
-                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
-    }
-    ICLKernel::configure_internal(win);
-
-    // Set output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    return Status{};
-}
-
-void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(_input->info()->data_type());
-
-    // Collapse windo
-    Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3);
-    Window slice      = new_window.first_slice_window_3D();
-
-    if(is_quantized_per_channel)
-    {
-        unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters
-        _kernel.setArg(idx++, _input->quantization().scale->cl_buffer());
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(new_window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.h b/src/core/CL/kernels/CLDequantizationLayerKernel.h
deleted file mode 100644
index 5579b5bc71..0000000000
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the dequantization layer kernel. */
-class CLDequantizationLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDequantizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDequantizationLayerKernel(const CLDequantizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDequantizationLayerKernel &operator=(const CLDequantizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLDequantizationLayerKernel(CLDequantizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLDequantizationLayerKernel &operator=(CLDequantizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLDequantizationLayerKernel() = default;
-    /** Set the input, output, min and max.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] output Destination tensor. Data types supported: F16/F32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input, output, min and max.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] output          Destination tensor. Data types supported: F16/F32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[in] output Output tensor info. Data types supported: F16/F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
deleted file mode 100644
index 2fc3c60f67..0000000000
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ /dev/null
@@ -1,585 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-
-    const DataLayout data_layout = input->data_layout();
-    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
-                                    "Weights feature map dimension should match the respective input's one");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9)
-                                    && std::get<0>(conv_info.stride()) > 2,
-                                    "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
-
-    if(data_layout == DataLayout::NCHW)
-    {
-        if(is_data_type_quantized(input->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
-                                            "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5,
-                                            "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
-        }
-    }
-
-    if(biases != nullptr)
-    {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
-                                        "Biases size and number of input feature maps should match");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
-                                        "Biases should be one dimensional");
-    }
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
-                                                           misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    }
-
-    const auto data_type = input->data_type();
-    if(is_data_type_quantized(data_type))
-    {
-        const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
-        const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
-
-        float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-    }
-    return Status{};
-}
-
-inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
-                                                      DataType data_type, DataLayout data_layout)
-{
-    return gpu_target_is_in(gpu_target,
-                            GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
-                            GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
-                            GPUTarget::G52, GPUTarget::G52LIT)
-           && (kernel_size <= 5)
-           && (conv_stride_x == 1) && (conv_stride_y == 1)
-           && (data_type == DataType::F32)
-           && (data_layout == DataLayout::NCHW);
-}
-
-inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
-                                 unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
-                                 unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input)
-{
-    const DataType   data_type     = input->data_type();
-    const DataLayout data_layout   = input->data_layout();
-    unsigned int     conv_stride_x = std::get<0>(conv_info.stride());
-    unsigned int     conv_stride_y = std::get<1>(conv_info.stride());
-
-    const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
-
-    if(run_optimized_bifrost)
-    {
-        // Configure kernel window
-        switch(kernel_size)
-        {
-            case 1:
-            {
-                num_elems_read_per_iteration_x    = 4;
-                num_elems_read_per_iteration_y    = 4;
-                num_elems_written_per_iteration_x = 4;
-                num_elems_written_per_iteration_y = 4;
-                break;
-            }
-            case 3:
-            {
-                num_elems_read_per_iteration_x    = 6;
-                num_elems_read_per_iteration_y    = 5;
-                num_elems_written_per_iteration_x = 4;
-                num_elems_written_per_iteration_y = 3;
-                break;
-            }
-            case 5:
-            {
-                num_elems_read_per_iteration_x    = 8;
-                num_elems_read_per_iteration_y    = 6;
-                num_elems_written_per_iteration_x = 4;
-                num_elems_written_per_iteration_y = 2;
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
-            }
-        }
-    }
-    else
-    {
-        num_elems_read_per_iteration_y    = kernel_size;
-        num_elems_written_per_iteration_x = 8;
-        num_elems_written_per_iteration_y = 1;
-        switch(kernel_size)
-        {
-            case 1:
-                switch(conv_stride_x)
-                {
-                    case 1:
-                        num_elems_read_per_iteration_x = 8;
-                        break;
-                    case 2:
-                        num_elems_read_per_iteration_x = 16;
-                        break;
-                    case 3:
-                        switch(input->element_size())
-                        {
-                            case 1:
-                                num_elems_read_per_iteration_x = 28;
-                                break;
-                            case 2:
-                                num_elems_read_per_iteration_x = 24;
-                                break;
-                            case 4:
-                                num_elems_read_per_iteration_x = 22;
-                                break;
-                            default:
-                                ARM_COMPUTE_ERROR("Invalid data size");
-                        }
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
-                }
-                break;
-            case 3:
-                switch(conv_stride_x)
-                {
-                    case 1:
-                        num_elems_read_per_iteration_x = 10;
-                        break;
-                    case 2:
-                        num_elems_read_per_iteration_x = 17;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
-                }
-                break;
-            case 5:
-                switch(conv_stride_x)
-                {
-                    case 1:
-                        num_elems_read_per_iteration_x = 12;
-                        break;
-                    case 2:
-                        num_elems_read_per_iteration_x = 20;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
-                }
-                break;
-            case 9:
-                switch(conv_stride_x)
-                {
-                    case 1:
-                        num_elems_read_per_iteration_x = 16;
-                        break;
-                    case 2:
-                        num_elems_read_per_iteration_x = 24;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
-                }
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Invalid direct convolution size");
-        }
-    }
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
-{
-    const DataLayout data_layout = input->data_layout();
-
-    // Get output shape
-    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, output_shape,
-                       1,
-                       input->data_type(),
-                       input->quantization_info());
-
-    if(data_layout == DataLayout::NHWC)
-    {
-        const unsigned int vec_size = std::min(static_cast<unsigned int>(output->tensor_shape()[0]), 4u);
-
-        // Create window and update padding
-        Window win = calculate_max_window(*output, Steps(vec_size, 1U));
-        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-        Status err = Status{};
-        return std::make_pair(err, win);
-    }
-    else if(data_layout == DataLayout::NCHW)
-    {
-        const int          width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-        const unsigned int kernel_size = weights->dimension(width_idx);
-
-        unsigned int num_elems_read_per_iteration_x    = 0;
-        unsigned int num_elems_read_per_iteration_y    = 0;
-        unsigned int num_elems_written_per_iteration_x = 0;
-        unsigned int num_elems_written_per_iteration_y = 0;
-
-        unsigned int conv_pad_left = conv_info.pad_left();
-        unsigned int conv_pad_top  = conv_info.pad_top();
-        unsigned int conv_stride_x = std::get<0>(conv_info.stride());
-        unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
-        setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
-                             num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
-                             kernel_size, conv_info, target, input);
-
-        // Create window and update padding
-        bool   window_changed = false;
-        Window win            = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
-        AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
-        AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-        window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-        Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-        return std::make_pair(err, win);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Not supported");
-    }
-}
-} // namespace
-
-CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
-    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _border_size(0), _conv_stride_x(0), _conv_stride_y(0), _conv_info()
-{
-}
-
-BorderSize CLDirectConvolutionLayerKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info);
-}
-
-void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                               const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-    // Perform validation
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
-                                                  weights->info(),
-                                                  (biases != nullptr) ? biases->info() : nullptr,
-                                                  output->info(),
-                                                  conv_info));
-
-    _conv_stride_x = std::get<0>(conv_info.stride());
-    _conv_stride_y = std::get<1>(conv_info.stride());
-    _data_layout   = input->info()->data_layout();
-    _input         = input;
-    _weights       = weights;
-    _output        = output;
-    _biases        = biases;
-    _conv_info     = conv_info;
-
-    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-    const unsigned int kernel_size = weights->info()->dimension(width_idx);
-    const DataType     data_type   = input->info()->data_type();
-
-    const GPUTarget gpu_target = get_target();
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    std::stringstream kernel_name;
-    CLBuildOptions    build_options;
-
-    if(_data_layout == DataLayout::NHWC)
-    {
-        _border_size = BorderSize();
-
-        kernel_name << "direct_convolution_nhwc";
-
-        const unsigned int n0               = win_config.second.x().step();
-        const unsigned int m0               = win_config.second.y().step();
-        const unsigned int k0               = adjust_vec_size(16u, _input->info()->dimension(channel_idx));
-        const unsigned int partial_store_n0 = _output->info()->dimension(channel_idx) % n0;
-        const unsigned int partial_store_m0 = (_output->info()->dimension(width_idx) * _output->info()->dimension(height_idx)) % m0;
-        const unsigned int pad_left         = conv_info.pad_left();
-        const unsigned int pad_top          = conv_info.pad_top();
-
-        if(_biases != nullptr)
-        {
-            build_options.add_option(std::string("-DHAS_BIAS"));
-            build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(_biases->info()->data_type())));
-        }
-        build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(width_idx)));
-        build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(height_idx)));
-        build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(_input->info()->dimension(channel_idx)));
-        build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
-        build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx)));
-        build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx)));
-        build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->dimension(channel_idx)));
-        build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(_output->info()->data_type()));
-        build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(width_idx)));
-        build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(height_idx)));
-        build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
-        build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
-        build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
-        build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
-        build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
-        build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
-        build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
-        build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
-        build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-        build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-
-        if(is_data_type_quantized(data_type))
-        {
-            const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
-            const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
-            const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
-
-            PixelValue zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
-            int        zero_value_s32;
-            zero_value.get(zero_value_s32);
-
-            float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
-            int   output_multiplier = 0;
-            int   output_shift      = 0;
-            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-            build_options.add_option("-DIS_QUANTIZED");
-            build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-            build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
-            build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
-            build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
-            build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
-            build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
-            build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
-        }
-        else
-        {
-            build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
-            build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
-            build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
-            build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
-        }
-    }
-    else
-    {
-        _border_size = BorderSize(_input->info()->padding());
-
-        kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-
-        build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
-
-        const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
-
-        if(run_optimized_for_bifrost)
-        {
-            build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
-
-            kernel_name << "_f32_bifrost";
-        }
-        else
-        {
-            build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
-            build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
-            build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
-            build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
-            build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
-
-            if(is_data_type_quantized(data_type))
-            {
-                const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
-                const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
-                const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
-
-                float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
-                int   output_multiplier = 0;
-                int   output_shift      = 0;
-                quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-                build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-                build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-                build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
-                build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
-                build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
-                build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
-
-                kernel_name.str("direct_convolution_quantized");
-            }
-        }
-    }
-
-    _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(data_type));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(kernel_size);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_size().left);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_size().top);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_size().right);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_size().bottom);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(_conv_stride_x);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(_conv_stride_y);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(width_idx));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(height_idx));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                const GPUTarget target)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, target).first);
-
-    return Status{};
-}
-
-void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    // Get initial windows
-    Window slice = window.first_slice_window_3D();
-
-    if(_data_layout == DataLayout::NHWC)
-    {
-        slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1) * _output->info()->dimension(2), 1));
-        slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(3), 1));
-
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        add_3D_tensor_argument(idx, _weights, slice);
-        if(_biases != nullptr)
-        {
-            add_1D_tensor_argument(idx, _biases, slice);
-        }
-        _kernel.setArg(idx++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    else
-    {
-        Window win_in = window;
-
-        win_in.adjust(Window::DimX, -_conv_info.pad_left(), true);
-        win_in.adjust(Window::DimY, -_conv_info.pad_top(), true);
-
-        const int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-        const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
-        win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
-        win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
-
-        Window       slice_in = win_in.first_slice_window_3D();
-        unsigned int idx1     = 2 * num_arguments_per_3D_tensor();
-        add_3D_tensor_argument(idx1, _weights, slice);
-
-        if(_biases != nullptr)
-        {
-            Window slice_biases;
-            slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
-            add_1D_tensor_argument(idx1, _biases, slice_biases);
-        }
-
-        _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
-
-        do
-        {
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _input, slice_in);
-            add_3D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
-    }
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h
deleted file mode 100644
index 0257d0c2dd..0000000000
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the  direct convolution kernel.
- */
-class CLDirectConvolutionLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDirectConvolutionLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDirectConvolutionLayerKernel(const CLDirectConvolutionLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDirectConvolutionLayerKernel &operator=(const CLDirectConvolutionLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDirectConvolutionLayerKernel(CLDirectConvolutionLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDirectConvolutionLayerKernel &operator=(CLDirectConvolutionLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLDirectConvolutionLayerKernel() = default;
-    /** Set the input, weights, biases and output tensors.
-     *
-     * @note: DirectConvolution only works in the following configurations:
-     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
-     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
-     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
-     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
-     *
-     * @param[in]  input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                       Data type supported:Same as @p input.
-     * @param[in]  biases    Biases tensor. Biases are 1D tensor with dimension [OFM].
-     *                       Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
-     * @param[out] output    Output tensor.
-     *                       The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
-    /** Set the input, weights, biases and output tensors.
-     *
-     * @note: DirectConvolution only works in the following configurations:
-     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
-     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
-     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
-     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                             The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                             Data type supported:Same as @p input.
-     * @param[in]  biases          Biases tensor. Biases are 1D tensor with dimension [OFM].
-     *                             Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
-     * @param[out] output          Output tensor.
-     *                             The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
-     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel
-     *
-     * @param[in] input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                      while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                      The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                      Data type supported:Same as @p input.
-     * @param[in] biases    Biases tensor. Biases are 1D tensor with dimension [OFM].
-     *                      Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[in] output    Output tensor.
-     *                      The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
-     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] target    Target GPU architecture.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-public:
-    const ICLTensor *_input;
-    const ICLTensor *_biases;
-    const ICLTensor *_weights;
-    ICLTensor       *_output;
-    DataLayout       _data_layout;
-    BorderSize       _border_size;
-    int              _conv_stride_x;
-    int              _conv_stride_y;
-    PadStrideInfo    _conv_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 22eee11c8a..3d8f875ef7 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -35,17 +38,20 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo               *input,
+                          const ITensorInfo               *output,
+                          const ITensorInfo               *idx,
+                          const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -55,33 +61,42 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo                     *input,
+                                                        ITensorInfo                     *output,
+                                                        ITensorInfo                     *idx,
+                                                        const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_UNUSED(idx, config);
 
     auto_init_if_empty(*output, input->clone()->set_num_channels(2));
 
     Window win = calculate_max_window(*output, Steps());
-    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     return std::make_pair(Status{}, win);
 }
 } // namespace
 
-CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
-    : _input(nullptr), _output(nullptr), _idx(nullptr)
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const ICLTensor                 *input,
+                                        ICLTensor                       *output,
+                                        const ICLTensor                 *idx,
+                                        const FFTDigitReverseKernelInfo &config)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config);
 }
 
-void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const CLCompileContext          &compile_context,
+                                        const ICLTensor                 *input,
+                                        ICLTensor                       *output,
+                                        const ICLTensor                 *idx,
+                                        const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
-    auto padding_info = get_padding_info({ input, output, idx });
+    auto padding_info = get_padding_info({input, output, idx});
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
 
     _input  = input;
@@ -112,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo               *input,
+                                         const ITensorInfo               *output,
+                                         const ITensorInfo               *idx,
+                                         const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
 
     return Status{};
 }
@@ -135,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
         add_3D_tensor_argument(idx, _output, slice);
         add_1D_tensor_argument(idx, _idx, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
index e5583a4c22..fdd1bcc3d3 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
 #define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 namespace arm_compute
 {
 // Forward declarations
@@ -56,7 +56,8 @@ public:
      * @param[in]  idx    Digit reverse index tensor. Data type supported: U32
      * @param[in]  config Kernel configuration.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+    void
+    configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -65,7 +66,11 @@ public:
      * @param[in]  idx             Digit reverse index tensor. Data type supported: U32
      * @param[in]  config          Kernel configuration.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+    void configure(const CLCompileContext          &compile_context,
+                   const ICLTensor                 *input,
+                   ICLTensor                       *output,
+                   const ICLTensor                 *idx,
+                   const FFTDigitReverseKernelInfo &config);
     /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
      *
      * @param[in] input  Source tensor info. Data types supported: F16/F32.
@@ -75,7 +80,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+    static Status validate(const ITensorInfo               *input,
+                           const ITensorInfo               *output,
+                           const ITensorInfo               *idx,
+                           const FFTDigitReverseKernelInfo &config);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 5db3cb6bf2..3729e6b77d 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -45,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -58,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
 {
-    if(output != nullptr)
+    if (output != nullptr)
     {
         auto_init_if_empty(*output, *input);
     }
@@ -70,18 +73,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     steps.set(config.axis, config.radix);
 
     Window win = calculate_max_window(*input, steps);
-    if(output != nullptr)
-    {
-        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-    }
 
     return std::make_pair(Status{}, win);
 }
 } // namespace
 
-CLFFTRadixStageKernel::CLFFTRadixStageKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
@@ -89,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
 
-void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+void CLFFTRadixStageKernel::configure(const CLCompileContext        &compile_context,
+                                      ICLTensor                     *input,
+                                      ICLTensor                     *output,
+                                      const FFTRadixStageKernelInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
-    auto padding_info = get_padding_info({ input, output });
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+    auto padding_info = get_padding_info({input, output});
 
     _input        = input;
     _output       = output;
@@ -112,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set static arguments if not the first stage
-    if(!config.is_first_stage)
+    if (!config.is_first_stage)
     {
         const unsigned int Ni        = config.Nx * config.radix;
         const float        exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
-        unsigned int       idx       = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+        unsigned int       idx =
+            (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
         _kernel.setArg<cl_uint>(idx++, config.Nx);
         _kernel.setArg<cl_uint>(idx++, Ni);
         _kernel.setArg<cl_float>(idx, exp_const);
@@ -138,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status CLFFTRadixStageKernel::validate(const ITensorInfo             *input,
+                                       const ITensorInfo             *output,
+                                       const FFTRadixStageKernelInfo &config)
 {
     const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                              (run_in_place) ? nullptr : output->clone().get(),
-                                                              config)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+            .first);
 
     return Status{};
 }
 
 std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
 {
-    return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+    return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
 }
 
 void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -167,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(!_run_in_place)
+        if (!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h
index 9bb310db83..de80bfced3 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.h
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
 #define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 #include <set>
 
 namespace arm_compute
@@ -69,7 +69,10 @@ public:
      * @param[out]    output          Destination tensor. Can be nullptr. Data type supported: same as @p input
      * @param[in]     config          FFT descriptor metadata.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+    void configure(const CLCompileContext        &compile_context,
+                   ICLTensor                     *input,
+                   ICLTensor                     *output,
+                   const FFTRadixStageKernelInfo &config);
     /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
      *
      * @param[in] input  Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index edcf5d5a5d..be6e16b074 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -41,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -50,30 +53,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    // Configure kernel window
-    Window win = calculate_max_window(*input, Steps());
-
-    if(output != nullptr)
-    {
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output, *input->clone());
-
-        // CLFFTScaleKernel doesn't need padding so update_window_and_padding() can be skipped
-        Coordinates coord;
-        coord.set_num_dimensions(output->num_dimensions());
-        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-    }
-
-    return std::make_pair(Status{}, win);
-}
 } // namespace
 
-CLFFTScaleKernel::CLFFTScaleKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
@@ -81,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
 
-void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+void CLFFTScaleKernel::configure(const CLCompileContext   &compile_context,
+                                 ICLTensor                *input,
+                                 ICLTensor                *output,
+                                 const FFTScaleKernelInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input        = input;
     _output       = output;
@@ -94,20 +81,28 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels()
+                                                                                      : input->info()->num_channels()));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option_if(config.conjugate, "-DCONJ");
     std::string kernel_name = "fft_scale_conj";
     _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set static arguments
-    unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    unsigned int idx =
+        (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
     _kernel.setArg<cl_float>(idx, config.scale);
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), _run_in_place ? nullptr : output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    if (output != nullptr)
+    {
+        // Output auto inizialitation if not yet initialized
+        auto_init_if_empty(*output->info(), *input->info()->clone());
+    }
+
+    ICLKernel::configure_internal(win);
 
     // Set config_id for enabling LWS tuning
     _config_id = kernel_name;
@@ -124,7 +119,6 @@ Status CLFFTScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *o
 {
     ARM_COMPUTE_UNUSED(config);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
     return Status{};
 }
@@ -141,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue)
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(!_run_in_place)
+        if (!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h
index cc518be193..b995282e02 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.h
+++ b/src/core/CL/kernels/CLFFTScaleKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
 #define ARM_COMPUTE_CLFFTSCALEKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 namespace arm_compute
 {
 // Forward declarations
@@ -63,7 +63,10 @@ public:
      * @param[out]    output          Destination tensor. Data type supported: same as @p input
      * @param[in]     config          Kernel configuration
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+    void configure(const CLCompileContext   &compile_context,
+                   ICLTensor                *input,
+                   ICLTensor                *output,
+                   const FFTScaleKernelInfo &config);
     /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
      *
      * @param[in] input  Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 840ed0ca2f..86bb502da3 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,16 +29,18 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
-CLFillBorderKernel::CLFillBorderKernel()
-    : ICLKernel(), _tensor(nullptr)
+CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 bool CLFillBorderKernel::is_parallelisable() const
@@ -54,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue
     ICLKernel::add_argument<T>(idx, static_cast<T>(value));
 }
 
-void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(ICLTensor        *tensor,
+                                   BorderSize        border_size,
+                                   BorderMode        border_mode,
+                                   const PixelValue &constant_border_value)
 {
     configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value);
 }
 
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+                                   ICLTensor              *tensor,
+                                   BorderSize              border_size,
+                                   BorderMode              border_mode,
+                                   const PixelValue       &constant_border_value)
 {
     _tensor = tensor;
     configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
 }
 
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+                                   ITensorInfo            *tensor,
+                                   BorderSize              border_size,
+                                   BorderMode              border_mode,
+                                   const PixelValue       &constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON(tensor == nullptr);
     ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
-    auto padding_info = get_padding_info({ tensor });
+    auto padding_info = get_padding_info({tensor});
 
     border_size.limit(tensor->padding());
 
     // If there is no border: early exit
-    if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+    if (border_size.empty() || border_mode == BorderMode::UNDEFINED)
     {
         return;
     }
@@ -96,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Create static kernel arguments
-    const unsigned int valid_width  = tensor->valid_region().shape[0];
-    const unsigned int valid_height = tensor->valid_region().shape[1];
-    const cl_int2      valid_region_coords =
-    {
-        {
-            static_cast<cl_int>(tensor->valid_region().anchor[0]),
-            static_cast<cl_int>(tensor->valid_region().anchor[1]),
-        }
-    };
-    const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+    const unsigned int valid_width         = tensor->valid_region().shape[0];
+    const unsigned int valid_height        = tensor->valid_region().shape[1];
+    const cl_int2      valid_region_coords = {{
+             static_cast<cl_int>(tensor->valid_region().anchor[0]),
+             static_cast<cl_int>(tensor->valid_region().anchor[1]),
+    }};
+    const unsigned int total_valid_width   = border_size.left + valid_width + border_size.right;
 
     // Set static kernel arguments
     unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
     ICLKernel::add_argument<cl_uint>(idx, valid_width);
     ICLKernel::add_argument<cl_uint>(idx, valid_height);
     ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
-    if(BorderMode::CONSTANT == border_mode)
+    if (BorderMode::CONSTANT == border_mode)
     {
-        switch(dt)
+        switch (dt)
         {
             case DataType::U8:
             case DataType::QASYMM8:
@@ -173,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
 void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     // Border mode undefined or border width == 0
-    if(_kernel() == nullptr)
+    if (_kernel() == nullptr)
     {
         return;
     }
 
-    const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto tensor =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
 
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
@@ -191,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, tensor, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 
 void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     // Border mode undefined or border width == 0
-    if(_kernel() == nullptr)
+    if (_kernel() == nullptr)
     {
         return;
     }
@@ -214,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _tensor, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h
index 7951f48171..5782143cf9 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.h
+++ b/src/core/CL/kernels/CLFillBorderKernel.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -57,7 +58,11 @@ public:
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(const CLCompileContext &compile_context,
+                   ICLTensor              *tensor,
+                   BorderSize              border_size,
+                   BorderMode              border_mode,
+                   const PixelValue       &constant_border_value = PixelValue());
     /** Initialise the kernel's input, output and border mode.
      *
      * @param[in,out] tensor                Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
@@ -65,7 +70,10 @@ public:
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(ICLTensor        *tensor,
+                   BorderSize        border_size,
+                   BorderMode        border_mode,
+                   const PixelValue &constant_border_value = PixelValue());
     /** Initialise the kernel's input, output and border mode.
      *
      * @param[in]     compile_context       The compile context to be used.
@@ -74,7 +82,11 @@ public:
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *tensor,
+                   BorderSize              border_size,
+                   BorderMode              border_mode,
+                   const PixelValue       &constant_border_value = PixelValue());
 
     /** Function to set the constant value on fill border kernel depending on type.
      *
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index 2116239080..7da0679ae4 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,27 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                          const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                          float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo         *input_weights,
+                          const ITensorInfo         *bn_mean,
+                          const ITensorInfo         *bn_var,
+                          const ITensorInfo         *fused_weights,
+                          const ITensorInfo         *fused_bias,
+                          const ITensorInfo         *input_bias,
+                          const ITensorInfo         *bn_beta,
+                          const ITensorInfo         *bn_gamma,
+                          float                      epsilon,
+                          FuseBatchNormalizationType fbn_type)
 {
     ARM_COMPUTE_UNUSED(epsilon);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -53,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
     ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
 
-    if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+    if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
     }
     else
     {
-        const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t channel_idx =
+            get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
     }
 
     // Validate bias
-    if(input_bias != nullptr)
+    if (input_bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
     }
     // Validate beta
-    if(bn_beta != nullptr)
+    if (bn_beta != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
     }
     // Validate gamma
-    if(bn_gamma != nullptr)
+    if (bn_gamma != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
     }
     // Validate output weights
-    if(fused_weights != nullptr && fused_weights->total_size() != 0)
+    if (fused_weights != nullptr && fused_weights->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
     }
     // Validate output bias
-    if(fused_bias != nullptr && fused_bias->total_size() != 0)
+    if (fused_bias != nullptr && fused_bias->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -100,27 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
 } // namespace
 
 CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
-    : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
-      _run_in_place_weights(false), _run_in_place_bias(false)
+    : _input_weights(nullptr),
+      _input_bias(nullptr),
+      _bn_mean(nullptr),
+      _bn_var(nullptr),
+      _bn_gamma(nullptr),
+      _bn_beta(nullptr),
+      _fused_weights(nullptr),
+      _fused_bias(nullptr),
+      _epsilon(),
+      _run_in_place_weights(false),
+      _run_in_place_bias(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                               ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                               const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                               float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor           *input_weights,
+                                               const ICLTensor           *bn_mean,
+                                               const ICLTensor           *bn_var,
+                                               ICLTensor                 *fused_weights,
+                                               ICLTensor                 *fused_bias,
+                                               const ICLTensor           *input_bias,
+                                               const ICLTensor           *bn_beta,
+                                               const ICLTensor           *bn_gamma,
+                                               float                      epsilon,
+                                               FuseBatchNormalizationType fbn_type)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+              input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                               ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                               const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                               float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const CLCompileContext    &compile_context,
+                                               const ICLTensor           *input_weights,
+                                               const ICLTensor           *bn_mean,
+                                               const ICLTensor           *bn_var,
+                                               ICLTensor                 *fused_weights,
+                                               ICLTensor                 *fused_bias,
+                                               const ICLTensor           *input_bias,
+                                               const ICLTensor           *bn_beta,
+                                               const ICLTensor           *bn_gamma,
+                                               float                      epsilon,
+                                               FuseBatchNormalizationType fbn_type)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
 
-    auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma });
+    auto padding_info =
+        get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma});
 
     _input_weights = input_weights;
     _input_bias    = input_bias;
@@ -133,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
     _epsilon       = epsilon;
 
     _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights);
-    _run_in_place_bias    = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
+    _run_in_place_bias =
+        (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
 
     // Auto initialize outputs
-    if(_fused_weights != nullptr)
+    if (_fused_weights != nullptr)
     {
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
     }
-    if(_fused_bias != nullptr)
+    if (_fused_bias != nullptr)
     {
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
     }
 
     // Validate arguments
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
-                                                  (fused_weights != nullptr) ? fused_weights->info() : nullptr,
-                                                  (fused_bias != nullptr) ? fused_bias->info() : nullptr,
-                                                  (input_bias != nullptr) ? input_bias->info() : nullptr,
-                                                  (bn_beta != nullptr) ? bn_beta->info() : nullptr,
-                                                  (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
-                                                  epsilon, fbn_type));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+        input_weights->info(), bn_mean->info(), bn_var->info(),
+        (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+        (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+        (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+        fbn_type));
 
     // Configure kernel window
     Window win = calculate_max_window(*input_weights->info());
@@ -163,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type()));
-    build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
+    build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION,
+                             "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
     build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
     build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
     build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
@@ -178,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                                                const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                                                const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                                                float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo         *input_weights,
+                                                const ITensorInfo         *bn_mean,
+                                                const ITensorInfo         *bn_var,
+                                                const ITensorInfo         *fused_weights,
+                                                const ITensorInfo         *fused_bias,
+                                                const ITensorInfo         *input_bias,
+                                                const ITensorInfo         *bn_beta,
+                                                const ITensorInfo         *bn_gamma,
+                                                float                      epsilon,
+                                                FuseBatchNormalizationType fbn_type)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+                                                   input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
     return Status{};
 }
 
@@ -200,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl::
     // Add kernel arguments
     unsigned int idx = 0;
     add_3D_tensor_argument(idx, _input_weights, slice_3d);
-    if(_input_bias != nullptr)
+    if (_input_bias != nullptr)
     {
         add_1D_tensor_argument(idx, _input_bias, slice_1d);
     }
     add_1D_tensor_argument(idx, _bn_mean, slice_1d);
     add_1D_tensor_argument(idx, _bn_var, slice_1d);
-    if(!_run_in_place_weights)
+    if (!_run_in_place_weights)
     {
         add_3D_tensor_argument(idx, _fused_weights, slice_3d);
     }
-    if(!_run_in_place_bias)
+    if (!_run_in_place_bias)
     {
         add_1D_tensor_argument(idx, _fused_bias, slice_1d);
     }
-    if(_bn_beta != nullptr)
+    if (_bn_beta != nullptr)
     {
         add_1D_tensor_argument(idx, _bn_beta, slice_1d);
     }
-    if(_bn_gamma != nullptr)
+    if (_bn_gamma != nullptr)
     {
         add_1D_tensor_argument(idx, _bn_gamma, slice_1d);
     }
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
index 78b1e74cab..76ec7a759f 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -62,9 +62,16 @@ public:
      * @param[in]  epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
      * @param[in]  fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
      */
-    void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
-                   const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    void configure(const ICLTensor           *input_weights,
+                   const ICLTensor           *bn_mean,
+                   const ICLTensor           *bn_var,
+                   ICLTensor                 *fused_weights,
+                   ICLTensor                 *fused_bias,
+                   const ICLTensor           *input_bias = nullptr,
+                   const ICLTensor           *bn_beta    = nullptr,
+                   const ICLTensor           *bn_gamma   = nullptr,
+                   float                      epsilon    = 0.001f,
+                   FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
     /** Set the source, destination of the kernel
      *
      * @param[in]  compile_context The compile context to be used.
@@ -81,9 +88,17 @@ public:
      * @param[in]  epsilon         (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
      * @param[in]  fbn_type        (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
-                   const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    void configure(const CLCompileContext    &compile_context,
+                   const ICLTensor           *input_weights,
+                   const ICLTensor           *bn_mean,
+                   const ICLTensor           *bn_var,
+                   ICLTensor                 *fused_weights,
+                   ICLTensor                 *fused_bias,
+                   const ICLTensor           *input_bias = nullptr,
+                   const ICLTensor           *bn_beta    = nullptr,
+                   const ICLTensor           *bn_gamma   = nullptr,
+                   float                      epsilon    = 0.001f,
+                   FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
     /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
      *
      * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -101,10 +116,16 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                           const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
-                           float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    static Status validate(const ITensorInfo         *input_weights,
+                           const ITensorInfo         *bn_mean,
+                           const ITensorInfo         *bn_var,
+                           const ITensorInfo         *fused_weights,
+                           const ITensorInfo         *fused_bias,
+                           const ITensorInfo         *input_bias = nullptr,
+                           const ITensorInfo         *bn_beta    = nullptr,
+                           const ITensorInfo         *bn_gamma   = nullptr,
+                           float                      epsilon    = 0.001f,
+                           FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index 9215fd602d..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                          const GEMMReshapeInfo &gemm_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    if(input0->data_type() == DataType::QASYMM8)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
-    const int m = gemm_info.m();
-    const int n = gemm_info.n();
-    const int k = gemm_info.k();
-
-    ARM_COMPUTE_UNUSED(m);
-    ARM_COMPUTE_UNUSED(n);
-    ARM_COMPUTE_UNUSED(k);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
-    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
-    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
-    if(gemm_info.reinterpret_input_as_3d())
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
-    }
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d();
-    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
-
-    Window win{};
-    bool   window_changed = false;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
-    TensorInfo tmp_info(*output);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-
-    win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    // RHS matrix still needs padding on the X
-    AccessWindowStatic input1_access(input1, 0, 0,
-                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
-                                     input1->dimension(1));
-
-    window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                     const GEMMReshapeInfo &gemm_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                     const GEMMRHSMatrixInfo &rhs_info,
-                                                     const GEMMReshapeInfo   &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
-    _input0                   = input0;
-    _input1                   = input1;
-    _output                   = output;
-    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d();
-    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
-    // We still need padding on the X dimension for the RHS matrix
-    auto padding_info = get_padding_info({ input0, output });
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
-    {
-        _reinterpret_input_as_3d  = false;
-        _reinterpret_output_as_3d = false;
-    }
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
-    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
-    // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
-    const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
-
-    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
-    // NOTE: This might have implications on heuristics and performance
-    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    std::string kernel_name("gemmlowp_mm_native");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
-    _config_id += "_";
-    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k());
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.k0);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                      const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              output->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
-        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    if(_reinterpret_output_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 125f0c6948..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
-class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyNativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1    Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                       lhs_info.m0: 2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info  RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: same as lhs_info.k0
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1          Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                             lhs_info.m0: 2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: same as lhs_info.k0
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMReshapeInfo &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel
-     *
-     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] input1    Input tensor info for the RHS matrix. Data type supported: same as @p input0
-     * @param[in] output    Output tensor info. Data type supported: S32
-     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                      lhs_info.m0: 2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     * @param[in] rhs_info  RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: same as lhs_info.k0
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMReshapeInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index 848f272e50..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                          const GEMMReshapeInfo &gemm_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
-    ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
-    const int m = gemm_info.m();
-    const int n = gemm_info.n();
-    const int k = gemm_info.k();
-
-    TensorShape tensor_shape0{ input0->tensor_shape() };
-    tensor_shape0.set(0, k);
-    tensor_shape0.set(1, m);
-
-    TensorShape tensor_shape1{ input1->tensor_shape() };
-    tensor_shape1.set(0, n);
-    tensor_shape1.set(1, k);
-
-    const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
-    const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
-    const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
-    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
-    TensorInfo tmp_info(*output);
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-    Window win                          = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    return std::make_pair(Status{}, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                       const GEMMReshapeInfo &gemm_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                       const GEMMRHSMatrixInfo &rhs_info,
-                                                       const GEMMReshapeInfo   &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
-    _input0                   = input0;
-    _input1                   = input1;
-    _output                   = output;
-    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-    _k                        = gemm_info.k();
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    auto              padding_info = get_padding_info({ input0, input1, output });
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
-
-    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
-    const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
-    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
-    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
-    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    std::string kernel_name("gemmlowp_mm_reshaped_");
-    kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
-    kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
-    _config_id += "_";
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k());
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.k0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.v0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.h0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.interleave);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.interleave);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                        const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              output->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    if(_reinterpret_output_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 4;
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index 100100b1b1..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and  @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyReshapedKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                       lhs_info.m0: 2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     *                       lhs_info.transpose: false
-     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: same as lhs_info.k0
-     *                       rhs_info.transpose: true
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info        LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                             lhs_info.m0: 2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     *                             lhs_info.transpose: false
-     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: same as lhs_info.k0
-     *                             rhs_info.transpose: true
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMReshapeInfo &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel
-     *
-     * @param[in] input0    Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in] input1    Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in] output    Output tensor info. Data type supported: S32
-     * @param[in] lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                      lhs_info.m0: 2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     *                      lhs_info.transpose: false
-     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: 2,3,4,8,16
-     *                      rhs_info.transpose: true
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMReshapeInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_output_as_3d;
-    unsigned int     _k;
-    bool             _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
deleted file mode 100644
index d39900a561..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ /dev/null
@@ -1,573 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
-                          const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    if(input0->data_type() == DataType::QASYMM8)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-
-    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
-    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
-    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
-    const int m = gemm_info.m;
-    const int n = gemm_info.n;
-    const int k = gemm_info.k;
-
-    TensorShape tensor_shape1{ input1->tensor_shape() };
-    tensor_shape1.set(0, n);
-    tensor_shape1.set(1, k);
-
-    const TensorInfo tensor_info1          = input1->clone()->set_tensor_shape(tensor_shape1);
-    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
-    if(gemm_info.reinterpret_input_as_3d)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
-    const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(expected_output_shape);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        if(output_stage.type == GEMMLowpOutputStageType::NONE)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-        }
-    }
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
-                                    "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
-
-    // Checks performed if the output stage needs to be fused
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
-    {
-        // If a_offset == 0, vector_sum_col can be a nullptr
-        if(gemm_info.a_offset != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_output_shape[0]);
-        }
-
-        // If b_offset == 0, vector_sum_row can be a nullptr
-        if(gemm_info.b_offset != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
-            // Check if mm result is a 3D reinterpretation
-            const bool reinterpret_as_3d = expected_output_shape.num_dimensions() > 1 && expected_output_shape.y() != vector_sum_row->tensor_shape().x();
-
-            // Validate input
-            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_output_shape[1] * expected_output_shape[2]));
-            ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_output_shape[1]);
-
-            if(expected_output_shape.num_dimensions() > 1)
-            {
-                const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
-                TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
-                vector_sum_row_shape.collapse_from(1);
-                TensorShape collapsed_output_shape(expected_output_shape);
-                collapsed_output_shape.collapse_from(output_batch_idx);
-
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_output_shape[output_batch_idx],
-                                                "vector_sum_row must have the same number of batches of output tensor");
-
-                if(gemm_info.a_offset != 0)
-                {
-                    TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
-                    vector_sum_col_shape.collapse_from(1);
-
-                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                    "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
-                }
-            }
-        }
-
-        if(output->total_size() != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-
-        if(output_multipliers != nullptr && output_shifts != nullptr)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-            if(output_stage.is_quantized_per_channel)
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_shifts->dimension(0));
-                ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_multipliers->dimension(0));
-            }
-        }
-    }
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMKernelInfo &gemm_info,
-                                                        ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
-                                                        ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
-{
-    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
-    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d != 0);
-
-    Window win{};
-    Window win_out{};
-    bool   window_changed = false;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // Output tensor auto initialization if not yet initialized
-    const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
-    if(output_stage.type != GEMMLowpOutputStageType::NONE)
-    {
-        auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(output_stage.output_data_type));
-    }
-    else
-    {
-        auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(DataType::S32));
-    }
-
-    TensorInfo tmp_info(*output);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
-    num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
-
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
-    {
-        if(gemm_info.a_offset != 0)
-        {
-            AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
-            window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
-        }
-        // No access window needed for vector_sum_row
-        ARM_COMPUTE_UNUSED(vector_sum_row);
-
-        if(bias != nullptr)
-        {
-            AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
-            window_changed = window_changed || update_window_and_padding(win_out, bias_access);
-        }
-
-        if(output_multipliers != nullptr && output_multipliers->dimension(0) > 1)
-        {
-            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
-            AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
-            window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
-        }
-    }
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel()
-    : _input0(nullptr),
-      _input1(nullptr),
-      _output(nullptr),
-      _vector_sum_col(nullptr),
-      _vector_sum_row(nullptr),
-      _bias(nullptr),
-      _output_multipliers(nullptr),
-      _output_shifts(nullptr),
-      _slide_matrix_b(true),
-      _reinterpret_input_as_3d(false),
-      _reinterpret_output_as_3d(false),
-      _use_dummy_work_items(false),
-      _is_quantized_per_channel(false),
-      _fuse_output_stage(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info,
-                                                              const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
-                                                              const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
-                                                              const GEMMKernelInfo &gemm_info,
-                                                              const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
-                                                              const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(),
-                                                  input1->info(),
-                                                  output->info(),
-                                                  gemm_info,
-                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
-                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
-                                                  bias != nullptr ? bias->info() : nullptr,
-                                                  output_multipliers != nullptr ? output_multipliers->info() : nullptr,
-                                                  output_shifts != nullptr ? output_shifts->info() : nullptr));
-
-    auto                          padding_info = get_padding_info({ input0, input1, output, vector_sum_row });
-    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
-    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
-    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-    const int32_t                 a_offset     = gemm_info.a_offset;
-    const int32_t                 b_offset     = gemm_info.b_offset;
-
-    _input0                   = input0;
-    _input1                   = input1;
-    _output                   = output;
-    _vector_sum_col           = vector_sum_col;
-    _vector_sum_row           = vector_sum_row;
-    _bias                     = bias;
-    _output_multipliers       = output_multipliers;
-    _output_shifts            = output_shifts;
-    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
-    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-    _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
-    {
-        _reinterpret_input_as_3d  = false;
-        _reinterpret_output_as_3d = false;
-    }
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(),
-                                                    input1->info(),
-                                                    output->info(),
-                                                    gemm_info,
-                                                    vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
-                                                    vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
-                                                    bias != nullptr ? bias->info() : nullptr,
-                                                    output_multipliers != nullptr ? output_multipliers->info() : nullptr,
-                                                    output_shifts != nullptr ? output_shifts->info() : nullptr,
-                                                    num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
-    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
-    // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
-    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
-    // NOTE: This might have implications on heuristics and performance
-    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int partial_store_m0 = internal_m % internal_m0;
-    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
-    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-
-    std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
-    kernel_name += rhs_info.transpose ? "t" : "nt";
-
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
-    {
-        kernel_name += "_fused_output_stage_fixedpoint";
-        _fuse_output_stage = true;
-        // If a_offset == 0, vector_sum_col can be a nullptr
-        if(a_offset != 0 && vector_sum_col != nullptr)
-        {
-            build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
-            build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
-        }
-        // If b_offset == 0, vector_sum_row can be a nullptr
-        build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
-        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * input0->info()->dimension(0)));
-        build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-        build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
-        build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
-        build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
-        build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-
-        const int min = output_stage.gemmlowp_min_bound;
-        const int max = output_stage.gemmlowp_max_bound;
-
-        PixelValue min_val{};
-        PixelValue max_val{};
-        std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
-        build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
-        build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
-    _config_id += "_";
-    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.k0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.h0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.interleave);
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
-                                                               const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                               const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              output->clone().get(),
-                                                              gemm_info,
-                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
-                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
-                                                              bias != nullptr ? bias->clone().get() : nullptr,
-                                                              output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
-                                                              output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
-        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    if(_reinterpret_output_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    // Set window for vector_sum_col
-    Window win_vector_sum_col = slice;
-    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    // Set window for vector_sum_row
-    Window win_vector_sum_row = slice;
-    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    Window biases_slice = slice;
-    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        if(_reinterpret_input_as_3d)
-        {
-            // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-            idx++;
-        }
-
-        if(_reinterpret_output_as_3d)
-        {
-            // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-            idx++;
-        }
-
-        if(_fuse_output_stage)
-        {
-            add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
-            add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
-            add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-            add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
-            add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
-        }
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index 222a8615e4..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
- * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
- */
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0             Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                                Only the following values are supported for LHS info:
-     *                                lhs_info.m0: 2,3,4,5,6,7,8
-     *                                lhs_info.k0: 2,3,4,8,16
-     *                                Only the following values are supported for RHS info:
-     *                                rhs_info.n0: 2,3,4,8,16
-     *                                rhs_info.k0: same as lhs_info.k0
-     *                                rhs_info.transpose: true
-     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
-                   const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input0             Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                                Only the following values are supported for LHS info:
-     *                                lhs_info.m0: 2,3,4,5,6,7,8
-     *                                lhs_info.k0: 2,3,4,8,16
-     *                                Only the following values are supported for RHS info:
-     *                                rhs_info.n0: 2,3,4,8,16
-     *                                rhs_info.k0: same as lhs_info.k0
-     *                                rhs_info.transpose: true
-     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
-                   const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
-     *
-     * @param[in] input0             Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] input1             Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
-     * @param[in] output             Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in] gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                               Only the following values are supported for LHS info:
-     *                               lhs_info.m0: 2,3,4,5,6,7,8
-     *                               lhs_info.k0: 2,3,4,8,16
-     *                               Only the following values are supported for RHS info:
-     *                               rhs_info.n0: 2,3,4,8,16
-     *                               rhs_info.k0: same as lhs_info.k0
-     *                               rhs_info.transpose: true
-     * @param[in] vector_sum_col     (Optional) Input row-vector info of sums of all the entries in each column of matrix B.
-     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in] vector_sum_row     (Optional) Input row-vector info of sums of all the entries in each row of matrix A.
-     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in] bias               (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32.
-     * @param[in] output_shifts      (Optional) Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr,
-                           const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr,
-                           const ITensorInfo *output_shifts = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    const ICLTensor *_bias;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-    bool             _is_quantized_per_channel;
-    bool             _fuse_output_stage;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
deleted file mode 100644
index c7844b9c28..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                          int32_t a_offset, int32_t b_offset)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
-    }
-
-    // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
-    }
-
-    // If b_offset == 0, vector_sum_row can be a nullptr
-    if(b_offset != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
-        // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
-        // Validate input
-        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
-        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
-        TensorShape output_shape = mm_result->tensor_shape();
-        if(output_shape.num_dimensions() > 1)
-        {
-            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
-            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
-            vector_sum_row_shape.collapse_from(1);
-            output_shape.collapse_from(output_batch_idx);
-
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
-                                            "mm_result tensor must have the same number of batches of output tensor");
-
-            if(a_offset != 0)
-            {
-                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
-                vector_sum_col_shape.collapse_from(1);
-
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
-            }
-        }
-    }
-
-    return Status{};
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
-    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
-{
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
-                                                   int32_t b_offset)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset);
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
-                                                   const ICLTensor *bias,
-                                                   int32_t k, int32_t a_offset,
-                                                   int32_t b_offset)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
-                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
-                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
-                                                  bias != nullptr ? bias->info() : nullptr,
-                                                  a_offset, b_offset)); // NOLINT
-
-    auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias });
-
-    _vector_sum_col = vector_sum_col;
-    _vector_sum_row = vector_sum_row;
-    _mm_result      = mm_result;
-    _bias           = bias;
-
-    // Check if input is a 3D reinterpretation
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->info()->num_dimensions() > 1
-                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration));
-
-    // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
-    {
-        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
-        build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
-    }
-    // If b_offset == 0, vector_sum_row can be a nullptr
-    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
-    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
-    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
-    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
-    std::string kernel_name("gemmlowp_offset_contribution");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name + "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                    int32_t a_offset, int32_t b_offset)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
-    return Status{};
-}
-
-void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    // Set window for vector_sum_col
-    Window win_vector_sum_col = slice;
-    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    // Set window for vector_sum_row
-    Window win_vector_sum_row = slice;
-    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    Window biases_slice = slice;
-    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _mm_result, slice);
-        add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
-        add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
-        add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index f8705595a0..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- *                   (vector_sum_col[k] * a_offset) +
- *                   (vector_sum_row[i] * b_offset) +
- *                   (a_offset * b_offset * k)
- *
- */
-class CLGEMMLowpOffsetContributionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpOffsetContributionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in, out] mm_result      Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]      vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in]      k              Number of matrix A columns or Matrix B rows
-     * @param[in]      a_offset       Offset to be added to each element of the matrix A.
-     * @param[in]      b_offset       Offset to be added to each element of the matrix B.
-     */
-    void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] mm_result       Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]      vector_sum_col  Input row-vector of sums of all the entries in each column of matrix B.
-     *                                 Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      vector_sum_row  Input row-vector of sums of all the entries in each row of matrix A.
-     *                                 Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                 Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in]      k               Number of matrix A columns or Matrix B rows
-     * @param[in]      a_offset        Offset to be added to each element of the matrix A.
-     * @param[in]      b_offset        Offset to be added to each element of the matrix B.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
-                   int32_t b_offset);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
-     *
-     * @param[in] mm_result      Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32
-     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] a_offset       Offset to be added to each element of the matrix A.
-     * @param[in] b_offset       Offset to be added to each element of the matrix B.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    ICLTensor       *_mm_result;
-    const ICLTensor *_bias;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
deleted file mode 100644
index b41d8704bd..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
-                          int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-    if(output_stage.is_quantized_per_channel)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
-    }
-
-    // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
-    }
-
-    // If b_offset == 0, vector_sum_row can be a nullptr
-    if(b_offset != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
-        // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
-        // Validate input
-        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
-        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
-        TensorShape output_shape = mm_result->tensor_shape();
-        if(output_shape.num_dimensions() > 1)
-        {
-            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
-            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
-            vector_sum_row_shape.collapse_from(1);
-            output_shape.collapse_from(output_batch_idx);
-
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
-                                            "mm_result tensor must have the same number of batches of output tensor");
-
-            if(a_offset != 0)
-            {
-                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
-                vector_sum_col_shape.collapse_from(1);
-
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
-            }
-        }
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
-    // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect");
-
-    return Status{};
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel()
-    : _mm_result(nullptr),
-      _vector_sum_col(nullptr),
-      _vector_sum_row(nullptr),
-      _bias(nullptr),
-      _output(nullptr),
-      _output_multipliers(nullptr),
-      _output_shifts(nullptr),
-      _is_quantized_per_channel(false)
-{
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
-                                                              int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                                                              const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, output, k, a_offset, b_offset, output_stage, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
-                                                              const ICLTensor *bias, ICLTensor *output,
-                                                              int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                                                              const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output, output_multipliers, output_shifts);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
-                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
-                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
-                                                  bias != nullptr ? bias->info() : nullptr,
-                                                  output->info(),
-                                                  a_offset, b_offset, output_stage,
-                                                  output_multipliers->info(), output_shifts->info())); // NOLINT
-
-    auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts });
-
-    const int min = output_stage.gemmlowp_min_bound;
-    const int max = output_stage.gemmlowp_max_bound;
-
-    _vector_sum_col           = vector_sum_col;
-    _vector_sum_row           = vector_sum_row;
-    _mm_result                = mm_result;
-    _bias                     = bias;
-    _output                   = output;
-    _output_multipliers       = output_multipliers;
-    _output_shifts            = output_shifts;
-    _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
-    // Check if input is a 3D reinterpretation
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->info()->num_dimensions() > 1
-                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
-    // Auto initialize the output
-    auto_init_if_empty(*output->info(), mm_result->info()->clone()->set_data_type(output_stage.output_data_type));
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration));
-
-    // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
-    {
-        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
-        build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
-    }
-    // If b_offset == 0, vector_sum_row can be a nullptr
-    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
-    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
-    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
-    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
-    build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
-    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
-    build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    PixelValue min_val{};
-    PixelValue max_val{};
-    std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
-    build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-
-    std::string kernel_name("gemmlowp_offset_contribution");
-    kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name + "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                               const ITensorInfo *output, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                                                               const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
-    return Status{};
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    // Set window for vector_sum_col
-    Window win_vector_sum_col = slice;
-    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    // Set window for vector_sum_row
-    Window win_vector_sum_row = slice;
-    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    Window biases_slice = slice;
-    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _mm_result, slice);
-        add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
-        add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
-        add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
-        add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 15f54d17a5..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
- * of matrix A and matrix B and performs the output stage defined by the output_stage argument
- *
- * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
- */
-class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpOffsetContributionOutputStageKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  k                  Number of matrix A columns or Matrix B rows
-     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
-     * @param[in]  output_stage       GEMMLowp output stage info
-     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     */
-    void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
-                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  k                  Number of matrix A columns or Matrix B rows
-     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
-     * @param[in]  output_stage       GEMMLowp output stage info
-     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
-                   int32_t k,
-                   int32_t a_offset, int32_t b_offset,
-                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
-     *
-     * @param[in] mm_result          Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32
-     * @param[in] vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in] a_offset           Offset to be added to each element of the matrix A.
-     * @param[in] b_offset           Offset to be added to each element of the matrix B.
-     * @param[in] output_stage       GEMMLowp output stage info
-     * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32
-     * @param[in] output_shifts      Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
-                           int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_mm_result;
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _is_quantized_per_channel;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
deleted file mode 100644
index 6a58d5e202..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-
-    // Check biases if exist
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                    const GEMMLowpOutputStageInfo *info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
-    return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                                   const GEMMLowpOutputStageInfo *info)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
-    auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
-
-    _input  = input;
-    _bias   = bias;
-    _output = output;
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    auto           min = info->gemmlowp_min_bound;
-    auto           max = info->gemmlowp_max_bound;
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
-    build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
-    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
-    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
-                             "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
-                             "-DMAX_BOUND=" + support::cpp11::to_string(max));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
-    // Create kernel
-    const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Create input window
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    // Setup bias slice
-    unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(_bias != nullptr)
-    {
-        Window biases_slice(slice);
-        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-        add_1D_tensor_argument(idx1, _bias, biases_slice);
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx1, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
deleted file mode 100644
index 8653102cd8..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Round to nearest division by a power-of-two using result_shift
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
-     * @param[in]  info            Output stage info. Used to pass the quantized output data type
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QSYMM8/QASYMM8_SIGNED/QSYMM16.
-     * @param[in] info   Output stage info. Used to pass the quantized output data type
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
deleted file mode 100644
index a5888a5ded..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                          const GEMMLowpOutputStageInfo *info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
-    ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
-    ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
-                                || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
-
-    // Check biases if exist
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-} // namespace
-
-class Coordinates;
-CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                               const GEMMLowpOutputStageInfo *info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
-    return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                              const GEMMLowpOutputStageInfo *info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                              const GEMMLowpOutputStageInfo *info)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
-    auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
-
-    _input  = input;
-    _bias   = bias;
-    _output = output;
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
-    auto min = info->gemmlowp_min_bound;
-    auto max = info->gemmlowp_max_bound;
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
-    build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
-    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Create input window
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    // Setup bias slice
-    unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(_bias != nullptr)
-    {
-        Window biases_slice(slice);
-        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-        add_1D_tensor_argument(idx1, _bias, biases_slice);
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx1, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
deleted file mode 100644
index 0a8d5e1942..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Requantize
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to
- *      - to the [0..255] range and cast to QASYMM8.
- *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data type supported: S32
-     * @param[in]  bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                    Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  info   Output stage info. Used to pass the quantized output data type
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  info            Output stage info. Used to pass the quantized output data type
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] info   Output stage info. Used to pass the quantized output data type
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
deleted file mode 100644
index 7d4352479c..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
-                                || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
-
-    // Check biases if exist
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != output_stage->output_data_type, "Mismatching output data type");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    return Status{};
-}
-} //namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage));
-
-    return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                       const GEMMLowpOutputStageInfo *output_stage)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
-                                                  (bias != nullptr) ? bias->info() : nullptr,
-                                                  output->info(),
-                                                  output_stage));
-
-    auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type));
-
-    _input  = input;
-    _bias   = bias;
-    _output = output;
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    auto           min = output_stage->gemmlowp_min_bound;
-    auto           max = output_stage->gemmlowp_max_bound;
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
-    build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
-    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
-    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
-                             "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
-                             "-DMAX_BOUND=" + support::cpp11::to_string(max));
-    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
-
-    unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(_bias != nullptr)
-    {
-        Window biases_slice(slice);
-        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-        add_1D_tensor_argument(idx1, _bias, biases_slice);
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx1, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-}
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index abdf33ea43..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Add offset terms to final result
- *  -# Multiply each entry of result by result_mult_int
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Shift the int32 accumulator by result_shift
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values:
- *  -#  -to the [0..255] range and cast to QASYMM8.
- *  -#  -to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input        Input tensor. Data type supported: S32
-     * @param[in]  bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output       Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  output_stage GEMMLowp output stage metadata.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  output_stage    GEMMLowp output stage metadata.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
-     *
-     * @param[in] input        Input tensor. Data type supported: S32
-     * @param[in] bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                         Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output       Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] output_stage GEMMLowp output stage metadata.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
deleted file mode 100644
index d508bf6f21..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
-
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
-    }
-    return Status{};
-}
-
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
-
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
-    }
-    return Status{};
-}
-} // namespace
-
-ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
-    : _input(), _output()
-{
-}
-
-void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), mtx_a, vector_sum_row, info);
-}
-
-void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32);
-
-    auto padding_info = get_padding_info({ mtx_a, vector_sum_row });
-
-    _input  = mtx_a;
-    _output = vector_sum_row;
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->info()->data_type()));
-    build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
-    const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
-
-    std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    // This kernel does not need padding
-    Window win = calculate_max_window(*vector_sum_row->info(), Steps());
-    ICLKernel::configure_internal(win);
-
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(_input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(_input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(_input->info()->dimension(2));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
-    Window slice_in  = collapsed.first_slice_window_2D();
-    Window slice_out = collapsed.first_slice_window_2D();
-
-    // Setup input slice. Its dimensions are increased in the cl kernel.
-    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(collapsed.slide_window_slice_2D(slice_out));
-}
-
-void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), mtx_b, vector_sum_col, info);
-}
-
-void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
-
-    _input  = mtx_b;
-    _output = vector_sum_col;
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*_output->info(), TensorShape(mtx_b->info()->dimension(0)), 1, DataType::S32);
-
-    auto padding_info = get_padding_info({ mtx_b, vector_sum_col });
-
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->info()->dimension(0));
-
-    // Set the arguments to pass at compile time
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->info()->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0)));
-    build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1)));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->info()->data_type()));
-    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->info()->data_type()));
-    build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
-
-    return Status{};
-}
-
-void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
-
-    Window slice_out = collapsed.first_slice_window_2D();
-    Window slice_in  = slice_out;
-
-    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(collapsed.slide_window_slice_2D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
deleted file mode 100644
index 237d8099b7..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all OpenCL reduction kernels */
-class ICLGEMMLowpReductionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    ICLGEMMLowpReductionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
-     * @param[in]  info   Kernel metadata:
-     *                    - k            Number of matrix columns/rows depending on the type of reduction.
-     *                    - is_reshaped  True if the matrix has been reshaped.
-     *                    - scalar       Scalar value to multiply each reduced column/row by.
-     *                    - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[out] output          Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of matrix product
- *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in]  info           Kernel metadata:
-     *                            - k            Number of matrix columns/rows depending on the type of reduction.
-     *                            - is_reshaped  True if the matrix has been reshaped.
-     *                            - scalar       Scalar value to multiply each reduced column/row by.
-     *                            - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[out] vector_sum_row  Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
-     *
-     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
-     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in] info           Kernel metadata:
-     *                           - k            Number of matrix columns/rows depending on the type of reduction.
-     *                           - is_reshaped  True if the matrix has been reshaped.
-     *                           - scalar       Scalar value to multiply each reduced column/row by.
-     *                           - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in]  info           Kernel metadata:
-     *                            - k            Number of matrix columns/rows depending on the type of reduction.
-     *                            - is_reshaped  True if the matrix has been reshaped.
-     *                            - scalar       Scalar value to multiply each reduced column/row by.
-     *                            - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  mtx_b           Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[out] vector_sum_col  Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
-     *
-     * @param[in] mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in] info           Kernel metadata:
-     *                           - k            Number of matrix columns/rows depending on the type of reduction.
-     *                           - is_reshaped  True if the matrix has been reshaped.
-     *                           - scalar       Scalar value to multiply each reduced column/row by.
-     *                           - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
deleted file mode 100644
index 6d3b1e5897..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float beta,
-                                 bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (input2 != nullptr)
-                                    && (!reshape_info.broadcast_bias()),
-                                    "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
-
-    if(!is_interleaved_transposed)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
-
-        if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
-        {
-            const unsigned int m           = reshape_info.reinterpret_input_as_3d() ? input0->dimension(1) * input0->dimension(2) : input0->dimension(1);
-            const unsigned int n           = input1->dimension(0);
-            const unsigned int input2_dim0 = input2->dimension(0);
-            const unsigned int input2_dim1 = input2->dimension(1);
-
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
-            if(reshape_info.broadcast_bias())
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
-            }
-            else
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
-            }
-        }
-    }
-    else
-    {
-        GEMMRHSMatrixInfo rhs_info;
-        GEMMLHSMatrixInfo lhs_info;
-        const auto        m                         = static_cast<unsigned int>(reshape_info.m());
-        const auto        n                         = static_cast<unsigned int>(reshape_info.n());
-        const int         k                         = reshape_info.k();
-        const int         mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
-        const int         mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
-        rhs_info.n0                                 = max_cl_vector_width / input1->element_size();
-        rhs_info.k0                                 = 1;
-        rhs_info.h0                                 = mult_transpose1xW_width;
-        rhs_info.interleave                         = false;
-        rhs_info.transpose                          = false;
-        lhs_info.m0                                 = 4;
-        lhs_info.k0                                 = 4;
-        lhs_info.v0                                 = mult_interleave4x4_height;
-        lhs_info.interleave                         = true;
-        lhs_info.transpose                          = true;
-
-        TensorShape tensor_shape0{ input0->tensor_shape() };
-        tensor_shape0.set(0, k);
-        tensor_shape0.set(1, m);
-
-        TensorShape tensor_shape1{ input1->tensor_shape() };
-        tensor_shape1.set(0, n);
-        tensor_shape1.set(1, k);
-
-        const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
-        const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
-        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
-        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
-        if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
-        {
-            const unsigned int input2_dim0 = input2->dimension(0);
-            const unsigned int input2_dim1 = input2->dimension(1);
-
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
-            if(reshape_info.broadcast_bias())
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
-            }
-            else
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
-            }
-        }
-    }
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-    }
-
-    return Status{};
-}
-
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
-                                                               float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
-                                                               ElementsProcessed &num_elements_processed)
-{
-    ARM_COMPUTE_UNUSED(beta);
-    bool   window_changed = false;
-    Window win{};
-    Window win_out{};
-
-    const DataType data_type                           = input0->data_type();
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool           reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
-    bool           reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 0);
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_input_as_3d  = false;
-        reinterpret_output_as_3d = false;
-    }
-
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));
-
-    TensorInfo tmp_info(*output);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    if(is_interleaved_transposed)
-    {
-        // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
-        ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());
-
-        // Configure kernel window
-        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
-        num_elems_processed_per_iteration_y = 4;
-
-        win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-        if(input2 != nullptr)
-        {
-            const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
-            const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;
-
-            AccessWindowStatic input2_access(input2, 0, 0,
-                                             ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
-                                             ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
-
-            window_changed = update_window_and_padding(win, input2_access); // window used by the execute_window_loop
-        }
-    }
-    else // The input tensors have not been reshaped
-    {
-        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
-        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
-        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
-
-        // Create kernels according to the architecture, data type and input size.
-        GPUTarget arch_target = get_arch_from_target(gpu_target);
-        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
-        {
-            num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
-        }
-
-        // Configure window
-        win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-        win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-        AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));
-        AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
-        AccessWindowStatic output_access(output, 0, 0,
-                                         output->dimension(0),
-                                         output->dimension(1));
-
-        if(input2 != nullptr)
-        {
-            const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
-            AccessWindowStatic input2_access(input2, 0, 0,
-                                             ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
-                                             input2->dimension(1));
-
-            window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
-                             update_window_and_padding(win_out, output_access);                             // window used to update the padding requirements of output tensor
-        }
-        else
-        {
-            window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
-                             update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
-        }
-
-        Coordinates coord;
-        coord.set_num_dimensions(output->num_dimensions());
-        output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
-    }
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _add_bias(false),
-      _broadcast_bias(false)
-{
-}
-
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision, activation_info);
-}
-
-void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
-                                           float beta,
-                                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
-    // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
-                                                  is_interleaved_transposed, reshape_info, fp_mixed_precision));
-
-    auto padding_info = is_interleaved_transposed ? get_padding_info({ input0, input1, output }) : get_padding_info({ input0, output });
-
-    _input0                   = input0;
-    _input1                   = input1;
-    _input2                   = helpers::float_ops::is_zero(beta) ? nullptr : input2;
-    _output                   = output;
-    _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
-    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
-    _add_bias                 = _input2 != nullptr;
-    _broadcast_bias           = reshape_info.broadcast_bias();
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
-    {
-        _reinterpret_input_as_3d  = false;
-        _reinterpret_output_as_3d = false;
-    }
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();
-
-    _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    const DataType data_type = input0->info()->data_type();
-
-    // Get target architecture
-    GPUTarget gpu_target = get_target();
-
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,
-                                                    gpu_target, num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false)
-    // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
-    // This means that the actual m used by the kernel is given by output->info()->dimension(1)
-    const unsigned int internal_m = _reinterpret_output_as_3d ? output->info()->dimension(1) * output->info()->dimension(2) : output->info()->dimension(1);
-    const unsigned int n          = output->info()->dimension(0);
-
-    const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
-    const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
-
-    const unsigned int m0 = num_elements_processed.y();
-    const unsigned int n0 = num_elements_processed.x();
-
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int partial_store_m0 = internal_m % m0;
-    const unsigned int partial_store_n0 = n % n0;
-
-    // Create build options
-    CLBuildOptions build_opts;
-
-    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
-    build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
-    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
-    build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS");
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));
-    build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));
-    build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));
-
-    const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
-
-    std::string kernel_name;
-    if(is_interleaved_transposed)
-    {
-        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
-        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
-
-        build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
-        build_opts.add_option("-DN=" + support::cpp11::to_string(n));
-        build_opts.add_option("-DK=" + support::cpp11::to_string(input1->info()->dimension(0) / (n0 * mult_transpose1xW_width)));
-        build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width));
-        build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height));
-        build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-        build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
-        if(is_data_type_float(data_type) && is_bifrost)
-        {
-            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
-        }
-        else
-        {
-            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
-            if(fp_mixed_precision && data_type == DataType::F16)
-            {
-                // currently wider accumulator is only supported for fp16 kernels.
-                kernel_name += "_acc32";
-            }
-        }
-    }
-    else // The input tensors have not been reshaped
-    {
-        build_opts.add_option("-DN=" + support::cpp11::to_string(n));
-        build_opts.add_option("-DK=" + support::cpp11::to_string(input0->info()->dimension(0)));
-        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-        build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-        build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-        build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-        build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
-        // Create kernels according to the architecture, data type and input size.
-        if(is_data_type_float(data_type) && is_bifrost)
-        {
-            kernel_name = "gemm_mm_floating_point";
-
-            if(input0->info()->num_dimensions() != 1)
-            {
-                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
-                if(fp_mixed_precision && data_type == DataType::F16)
-                {
-                    // currently wider accumulator is only supported for fp16 kernels.
-                    kernel_name += "_acc32";
-                }
-            }
-            else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
-            {
-                // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
-                // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
-                // FC6 and FC7 of AlexNet and VGG-16).
-                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
-            }
-
-            // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
-            // via exhaustive autotuning over a range of representative layer configurations.
-            set_lws_hint(cl::NDRange(4));
-        }
-        else // (MIDGARD and F32) or (F16)
-        {
-            kernel_name = "gemm_mm_floating_point";
-        }
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = "gemm_";
-    _config_id += (is_interleaved_transposed ? "reshaped_" : "");
-    _config_id += (_add_bias ? "add_bias_" : "");
-    _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
-    _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
-    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(3));
-    _config_id += "_";
-    _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
-                                            bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
-    // Note: num_elements_processed will be set in validate_and_configure_window()
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_UNUSED(activation_info);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              (input2 != nullptr) ? input2->clone().get() : nullptr,
-                                                              output->clone().get(),
-                                                              beta,
-                                                              is_interleaved_transposed,
-                                                              reshape_info,
-                                                              gpu_target,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0;
-
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias;
-        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    if(_reinterpret_output_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias;
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        if(_add_bias)
-        {
-            add_2D_tensor_argument(idx, _input2, slice);
-        }
-        add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-        if(_add_bias)
-        {
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
-        }
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 71d223b8ac..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply two input matrices "A" and "B" and add a martix "C" if provided. All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result.
- *  For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object
- *
- * @note If the input tensors @p input0 and @p input1 have been reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel" and @ref CLGEMMReshapeRHSMatrixKernel,
- *       the flag @p is_interleaved_transposed must be set to true
- *
- * @attention @p input1 tensor must have at least 2 dimensions (matrix)
- *
- */
-class CLGEMMMatrixMultiplyKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMMatrixMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyKernel(const CLGEMMMatrixMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyKernel &operator=(const CLGEMMMatrixMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyKernel(CLGEMMMatrixMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
-    /** Initialise the kernel's input, output and alpha
-     *
-     * @param[in]  input0                    Input tensor containing the Matrix A. Data types supported: F16/F32
-     * @param[in]  input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
-     * @param[in]  input2                    Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
-     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha                     Weight of the matrix product
-     * @param[in]  beta                      (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
-     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
-     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     * @param[in]  fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
-     * @param[in]  activation_info           (Optional) Activation to apply after the matrix multiplication
-     *
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
-                   bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-    /** Initialise the kernel's input, output and alpha
-     *
-     * @param[in]  compile_context           The compile context to be used.
-     * @param[in]  input0                    Input tensor containing the Matrix A. Data types supported: F16/F32
-     * @param[in]  input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
-     * @param[in]  input2                    Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
-     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha                     Weight of the matrix product
-     * @param[in]  beta                      (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
-     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
-     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     * @param[in]  fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
-     * @param[in]  activation_info           (Optional) Activation to apply after the matrix multiplication
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
-                   bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
-     *
-     * @param[in] input0                    Input tensor containing the Matrix A info. Data types supported: F16/F32
-     * @param[in] input1                    Input tensor containing the Matrix B info. Data type supported: same as @p input0
-     * @param[in] input2                    Input tensor containing the Matrix C (bias) info. Can be nullptr. Data type supported: same as @p input0
-     * @param[in] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in] alpha                     Weight of the matrix product
-     * @param[in] beta                      Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
-     * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
-     * @param[in] reshape_info              GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     * @param[in] gpu_target                GPU Target
-     * @param[in] fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
-     * @param[in] activation_info           (Optional) Activation to apply after the matrix multiplication
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
-                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _add_bias;
-    bool             _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index f07166e4bb..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                          const GEMMRHSMatrixInfo &rhs_info,
-                          const GEMMKernelInfo    &gemm_info)
-{
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
-                                    && (!gemm_info.broadcast_bias),
-                                    "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
-
-    const unsigned int m = gemm_info.m;
-    const unsigned int n = gemm_info.n;
-    const unsigned int k = gemm_info.k;
-
-    ARM_COMPUTE_UNUSED(m);
-    ARM_COMPUTE_UNUSED(n);
-    ARM_COMPUTE_UNUSED(k);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != k);
-    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != n);
-    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != k);
-    if(gemm_info.reinterpret_input_as_3d)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != m);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != m);
-    }
-
-    if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
-    {
-        const unsigned int input2_dim0 = input2->dimension(0);
-        const unsigned int input2_dim1 = input2->dimension(1);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
-        if(gemm_info.broadcast_bias)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
-        }
-    }
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                        const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
-    bool          reinterpret_output_as_3d            = gemm_info.depth_output_gemm3d != 0;
-
-    Window win{};
-    Window win_out{};
-    bool   window_changed = false;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
-    TensorInfo tmp_info(*output);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowStatic input0_access(input0, 0, 0,
-                                     input0->dimension(0),
-                                     input0->dimension(1));
-    AccessWindowStatic input1_access(input1, 0, 0,
-                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
-                                     input1->dimension(1));
-    AccessWindowStatic output_access(output, 0, 0,
-                                     output->dimension(0),
-                                     output->dimension(1));
-
-    if(input2 != nullptr)
-    {
-        const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
-        AccessWindowStatic input2_access(input2, 0, 0,
-                                         ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
-                                         input2->dimension(1));
-
-        window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, output_access);                             // window used to update the padding requirements of output tensor
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
-    }
-
-    output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyNativeKernel::CLGEMMMatrixMultiplyNativeKernel()
-    : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
-      _add_bias(false), _broadcast_bias(false)
-{
-}
-
-void CLGEMMMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                                                 const GEMMLHSMatrixInfo &lhs_info,
-                                                 const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
-                                                 float                    beta,
-                                                 const GEMMLHSMatrixInfo &lhs_info,
-                                                 const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
-    auto padding_info         = get_padding_info({ input0, output });
-    _input0                   = input0;
-    _input1                   = input1;
-    _input2                   = helpers::float_ops::is_zero(beta) ? nullptr : input2;
-    _output                   = output;
-    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
-    _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-    _add_bias                 = _input2 != nullptr;
-    _broadcast_bias           = gemm_info.broadcast_bias;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
-    {
-        _reinterpret_input_as_3d  = false;
-        _reinterpret_output_as_3d = false;
-    }
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
-    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
-    // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
-    const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
-    const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
-
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
-    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
-    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
-    // NOTE: This might have implications on heuristics and performance
-    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
-    build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
-    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
-    build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-
-    std::string kernel_name("gemm_mm_native");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += (_add_bias ? "add_bias_" : "");
-    _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
-    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
-    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.k0);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
-                                                  const GEMMLHSMatrixInfo &lhs_info,
-                                                  const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              input2 != nullptr ? input2->clone().get() : nullptr,
-                                                              output->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        unsigned int idx0;
-        if(_add_bias)
-        {
-            idx0 = 4 * num_arguments_per_2D_tensor() + 4;
-        }
-        else
-        {
-            idx0 = 3 * num_arguments_per_2D_tensor() + 3;
-        }
-        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    if(_reinterpret_output_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        unsigned int idx0;
-        if(_add_bias)
-        {
-            idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0);
-        }
-        else
-        {
-            idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
-        }
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        if(_add_bias)
-        {
-            add_2D_tensor_argument(idx, _input2, slice);
-        }
-        add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-        if(_add_bias)
-        {
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
-        }
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 6b6004b464..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when neither of the input matrices have been reshaped */
-class CLGEMMMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMMatrixMultiplyNativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyNativeKernel(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyNativeKernel &operator=(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyNativeKernel(CLGEMMMatrixMultiplyNativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyNativeKernel &operator=(CLGEMMMatrixMultiplyNativeKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1    Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output    Output tensor info. Data type supported: same as @p input0
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of the matrix bias
-     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
-     *                       lhs_info.m0: 1,2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info  RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: same of lhs_info.k0
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1          Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output          Output tensor info. Data type supported: same as @p input0
-     * @param[in]  alpha           Weight of the matrix product
-     * @param[in]  beta            Weight of the matrix bias
-     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
-     *                             lhs_info.m0: 1,2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: same of lhs_info.k0
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                   const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyNativeKernel
-     *
-     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in] input1    Input tensor info for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
-     * @param[in] output    Output tensor info. Data type supported: same as @p input0
-     * @param[in] alpha     Weight of the matrix product
-     * @param[in] beta      Weight of the matrix bias
-     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
-     *                      lhs_info.m0: 1,2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     * @param[in] rhs_info  RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: same of lhs_info.k0
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                           const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMKernelInfo    &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-    bool             _add_bias;
-    bool             _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H*/
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index 9f1ffa48eb..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,443 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLUtils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/gemm/CLGEMMHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                          const GEMMRHSMatrixInfo &rhs_info,
-                          const GEMMKernelInfo    &gemm_info)
-{
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
-                                    && (!gemm_info.broadcast_bias),
-                                    "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (input0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
-    ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(*input1, rhs_info));
-
-    const unsigned int m = gemm_info.m;
-    const unsigned int n = gemm_info.n;
-    const unsigned int k = gemm_info.k;
-
-    TensorShape tensor_shape0{ input0->tensor_shape() };
-    tensor_shape0.set(0, k);
-    tensor_shape0.set(1, m);
-
-    TensorShape tensor_shape1{ input1->tensor_shape() };
-    tensor_shape1.set(0, n);
-    tensor_shape1.set(1, k);
-
-    if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
-    {
-        const unsigned int input2_dim0 = input2->dimension(0);
-        const unsigned int input2_dim1 = input2->dimension(1);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
-        if(gemm_info.broadcast_bias)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
-        }
-    }
-
-    const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
-    const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
-    const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
-    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                        const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_output_as_3d            = gemm_info.depth_output_gemm3d != 0;
-
-    Window win{};
-    Window win_out{};
-    bool   window_changed = false;
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
-    TensorInfo tmp_info(*output);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowStatic input0_access(input0, 0, 0,
-                                     input0->dimension(0),
-                                     input0->dimension(1));
-    AccessWindowStatic input1_access(input1, 0, 0,
-                                     input1->dimension(0),
-                                     input1->dimension(1));
-    AccessWindowStatic output_access(output, 0, 0,
-                                     output->dimension(0),
-                                     output->dimension(1));
-
-    if(input2 != nullptr)
-    {
-        const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
-        const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
-
-        AccessWindowStatic input2_access(input2, 0, 0,
-                                         ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
-                                         ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
-
-        window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, output_access);                             // window used to update the padding requirements of output tensor
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
-    }
-
-    output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
-    : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _use_dummy_work_items(false), _add_bias(false),
-      _broadcast_bias(false), _export_to_cl_image(false), _k(1)
-{
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                                                   const GEMMLHSMatrixInfo &lhs_info,
-                                                   const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
-                                                   float                    beta,
-                                                   const GEMMLHSMatrixInfo &lhs_info,
-                                                   const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
-    auto padding_info         = get_padding_info({ input0, output });
-    _input0                   = input0;
-    _input1                   = input1;
-    _input2                   = helpers::float_ops::is_zero(beta) ? nullptr : input2;
-    _output                   = output;
-    _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-    _add_bias                 = _input2 != nullptr;
-    _broadcast_bias           = gemm_info.broadcast_bias;
-    _export_to_cl_image       = rhs_info.export_to_cl_image;
-    _k                        = gemm_info.k;
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    const bool     enable_mixed_precision = gemm_info.fp_mixed_precision;
-    const DataType data_type              = input0->info()->data_type();
-
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
-    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
-    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
-    build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
-    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
-    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
-    build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE");
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-    build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");
-    build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
-    build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1)));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-    build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
-    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
-    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
-    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    std::string kernel_name("gemm_mm_reshaped_");
-    kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
-    kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
-    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += (_add_bias ? "add_bias_" : "");
-    _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
-    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
-    _config_id += "_";
-    _config_id += (enable_mixed_precision ? "mixed_precision_" : "");
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.k0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.v0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.h0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.interleave);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.interleave);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
-                                                    const GEMMLHSMatrixInfo &lhs_info,
-                                                    const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              input2 != nullptr ? input2->clone().get() : nullptr,
-                                                              output->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-
-    cl::Image2D input1_image2d;
-
-    if(_export_to_cl_image)
-    {
-        const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2));
-        const size_t      image_row_pitch = _input1->info()->strides_in_bytes()[1];
-
-        input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, _input1->info()->data_type(), image_row_pitch);
-    }
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-
-        // LHS buffer
-        add_2D_tensor_argument(idx, _input0, slice);
-
-        // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
-        if(_export_to_cl_image)
-        {
-            _kernel.setArg(idx++, input1_image2d);
-        }
-        else
-        {
-            add_2D_tensor_argument(idx, _input1, slice_b);
-        }
-
-        // Bias buffer (_add_bias == true)
-        add_2D_tensor_argument_if(_add_bias, idx, _input2, slice);
-
-        // Output buffer
-        add_2D_tensor_argument(idx, _output, slice);
-
-        // K dimension (not used if _export_to_cl_image == true)
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
-
-        // LHS stride_z
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-
-        // RHS stride_z (not used if _export_to_cl_image == true)
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-
-        // Bias stride_z (if _add_bias == true)
-        if(_add_bias)
-        {
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
-        }
-
-        // Output stride_z
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-
-        // Cross-plan padding (if _reinterpret_output_as_3d = true)
-        if(_reinterpret_output_as_3d)
-        {
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
-        }
-
-        // Dispatch kernel
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index 2ffc322def..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and  @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMMatrixMultiplyReshapedKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyReshapedKernel(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyReshapedKernel &operator=(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyReshapedKernel(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyReshapedKernel &operator=(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
-     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
-     *       multiplications. i.e. float c = (half)a * (half)b
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
-     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
-     *       the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
-     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *
-     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
-     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
-     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of the matrix bias
-     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
-     *                       lhs_info.m0: 2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     *                       lhs_info.transpose: false
-     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                       rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
-     *                       rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
-     *                       rhs_info.transpose: true
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
-     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
-     *       multiplications. i.e. float c = (half)a * (half)b
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
-     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
-     *       the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
-     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32  (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
-     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
-     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha           Weight of the matrix product
-     * @param[in]  beta            Weight of the matrix bias
-     * @param[in]  lhs_info        LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                             lhs_info.m0: 2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     *                             lhs_info.transpose: false
-     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
-     *                             rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
-     *                             rhs_info.transpose: true
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                   const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel
-     *
-     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
-     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
-     *       multiplications. i.e. float c = (half)a * (half)b
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
-     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
-     *       the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
-     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *
-     * @param[in] input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32  (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
-     * @param[in] input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
-     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
-     * @param[in] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in] alpha     Weight of the matrix product
-     * @param[in] beta      Weight of the matrix bias
-     * @param[in] lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                      lhs_info.m0: 2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     *                      lhs_info.transpose: false
-     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
-     *                      rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
-     *                      rhs_info.transpose: true
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                           const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMKernelInfo    &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-    bool             _add_bias;
-    bool             _broadcast_bias;
-    bool             _export_to_cl_image;
-    unsigned int     _k;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
deleted file mode 100644
index 3dee4f24cd..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLUtils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/gemm/CLGEMMHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                          const GEMMRHSMatrixInfo &rhs_info,
-                          const GEMMKernelInfo    &gemm_info)
-{
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0");
-    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
-                                    && (!gemm_info.broadcast_bias),
-                                    "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
-    ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(*input1, rhs_info));
-
-    const unsigned int m = gemm_info.m;
-    const unsigned int n = gemm_info.n;
-    const unsigned int k = gemm_info.k;
-
-    TensorShape tensor_shape1{ input1->tensor_shape() };
-    tensor_shape1.set(0, n);
-    tensor_shape1.set(1, k);
-
-    if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
-    {
-        const unsigned int input2_dim0 = input2->dimension(0);
-        const unsigned int input2_dim1 = input2->dimension(1);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input0);
-        if(gemm_info.broadcast_bias)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
-        }
-    }
-
-    const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
-    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != k);
-    if(gemm_info.reinterpret_input_as_3d)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != m);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != m);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
-                                                        const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
-    bool          reinterpret_output_as_3d            = gemm_info.depth_output_gemm3d != 0;
-
-    Window win{};
-    Window win_out{};
-    bool   window_changed = false;
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    // This approach should only be used when the input/output tensors have pad on the y direction
-    if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
-    TensorInfo tmp_info(*output);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(output->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    if(input2 != nullptr)
-    {
-        const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
-        AccessWindowStatic input2_access(input2, 0, 0,
-                                         ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
-                                         input2->dimension(1));
-
-        window_changed = update_window_and_padding(win, input2_access);
-    }
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
-    : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
-      _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false), _has_pad_y(false)
-{
-}
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                                                          const GEMMLHSMatrixInfo &lhs_info,
-                                                          const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
-                                                          float                    alpha,
-                                                          float                    beta,
-                                                          const GEMMLHSMatrixInfo &lhs_info,
-                                                          const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
-    _input0                   = input0;
-    _input1                   = input1;
-    _input2                   = helpers::float_ops::is_zero(beta) ? nullptr : input2;
-    _output                   = output;
-    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
-    _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-    _add_bias                 = _input2 != nullptr;
-    _broadcast_bias           = gemm_info.broadcast_bias;
-    _export_to_cl_image       = rhs_info.export_to_cl_image;
-    _has_pad_y                = gemm_info.has_pad_y;
-
-    auto padding_info = get_padding_info({ input0, input1, output });
-
-    // In case both input and output have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
-    {
-        _reinterpret_input_as_3d  = false;
-        _reinterpret_output_as_3d = false;
-    }
-
-    // Check if we need to slide the matrix B
-    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
-    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
-    ElementsProcessed num_elements_processed{};
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
-    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
-    // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
-    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
-    // These variables are used only if gemm_info.has_pad_y == true
-    const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
-    const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
-
-    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
-    // NOTE: This might have implications on heuristics and performance
-    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
-    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
-    const unsigned int partial_store_m0 = internal_m % internal_m0;
-    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
-    build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
-    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
-    build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
-    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
-    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
-    build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1)));
-    build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
-    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-    if(_has_pad_y)
-    {
-        build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-        build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
-        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
-    }
-
-    std::string kernel_name("gemm_mm_reshaped_only_rhs_");
-    kernel_name += rhs_info.transpose ? "t" : "nt";
-    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += (_has_pad_y ? "" : "no_pad_y_");
-    _config_id += (_add_bias ? "add_bias_" : "");
-    _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
-    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
-    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
-    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
-    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gemm_info.k);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.n0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.k0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.h0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(rhs_info.interleave);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
-                                                           const GEMMLHSMatrixInfo &lhs_info,
-                                                           const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
-    ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
-                                                              input1->clone().get(),
-                                                              input2 != nullptr ? input2->clone().get() : nullptr,
-                                                              output->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
-                                                              num_elements_processed)
-                                .first);
-
-    return Status{};
-}
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    if(_input1->info()->num_dimensions() < 3)
-    {
-        // The stride_z for matrix B must be zero if we do not slice
-        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
-    }
-
-    const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
-    const size_t rhs_idx_batch_size = 2u;
-    const size_t bia_idx_batch_size = 2u;
-    const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
-
-    Window slice          = window.first_slice_window_3D();
-    Window slice_matrix_b = slice;
-
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    // Get cross plane pads
-    const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom;
-    const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom;
-
-    // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor
-    ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
-
-    cl::Image2D input1_image2d;
-
-    if(_export_to_cl_image)
-    {
-        const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2));
-        const size_t      image_row_pitch = _input1->info()->strides_in_bytes()[1];
-
-        input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, _input1->info()->data_type(), image_row_pitch);
-    }
-
-    do
-    {
-        Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
-        {
-            slice_b = slice_matrix_b;
-        }
-
-        unsigned int idx = 0;
-
-        // LHS buffer
-        add_2D_tensor_argument(idx, _input0, slice);
-
-        // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
-        if(_export_to_cl_image)
-        {
-            _kernel.setArg(idx++, input1_image2d);
-        }
-        else
-        {
-            add_2D_tensor_argument(idx, _input1, slice_b);
-        }
-
-        // Bias buffer (_add_bias == true)
-        add_2D_tensor_argument_if(_add_bias, idx, _input2, slice);
-
-        // Output buffer
-        add_2D_tensor_argument(idx, _output, slice);
-
-        // LHS stride_z
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[lhs_idx_batch_size]));
-
-        // RHS stride_z (not used if _export_to_cl_image == true)
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[rhs_idx_batch_size]));
-
-        // Bias stride_z (if _add_bias == true)
-        if(_add_bias)
-        {
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[bia_idx_batch_size]));
-        }
-
-        // Output stride_z
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[out_idx_batch_size]));
-
-        // Cross-plan padding (if _reinterpret_input_as_3d = true)
-        if(_reinterpret_input_as_3d && _has_pad_y)
-        {
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
-        }
-
-        // Cross-plan padding (if _reinterpret_output_as_3d = true)
-        if(_reinterpret_output_as_3d && _has_pad_y)
-        {
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
-        }
-
-        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index 5b96679a46..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when only the input matrix RHS (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
-     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
-     *       the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
-     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *
-     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
-     *                       The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of the matrix bias
-     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
-     *                       lhs_info.m0: 1,2,3,4,5,6,7,8
-     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                       rhs_info.k0: 2,3,4,8,16
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.transpose: true,false
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
-     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
-     *       the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
-     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
-     *                             The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha           Weight of the matrix product
-     * @param[in]  beta            Weight of the matrix bias
-     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
-     *                             lhs_info.m0: 1,2,3,4,5,6,7,8
-     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                             rhs_info.k0: 2,3,4,8,16
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.transpose: true,false
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                   const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
-     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
-     *       the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
-     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *
-     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
-     *                      The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in] input1    Input tensor info for the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
-     * @param[in] output    Output tensor info. Data type supported: same as @p input0
-     * @param[in] alpha     Weight of the matrix product
-     * @param[in] beta      Weight of the matrix bias
-     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
-     *                      lhs_info.m0: 1,2,3,4,5,6,7,8
-     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                      rhs_info.k0: 2,3,4,8,16
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.transpose: true,false
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                           const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMKernelInfo    &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-    bool             _add_bias;
-    bool             _broadcast_bias;
-    bool             _export_to_cl_image;
-    bool             _has_pad_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H*/
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
deleted file mode 100644
index 52510075b7..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
-    const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
-    const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
-    bool               window_changed                      = false;
-
-    TensorInfo tmp_info(*input);
-
-    if(reinterpret_input_as_3d)
-    {
-        // Since the input tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(input->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d)));
-
-    // Configure window
-    Window win    = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    Window win_in = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowStatic input_access(input, 0, 0,
-                                    input->dimension(0),
-                                    input->dimension(1));
-    AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
-
-    window_changed = update_window_and_padding(win_in, input_access) || // window used by the execute_window_loop
-                     update_window_and_padding(win, output_access);     // window used to update the padding requirements of output tensor
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window collapsed = win.collapse(win, Window::DimZ);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMReshapeLHSMatrixKernel::CLGEMMReshapeLHSMatrixKernel()
-    : _input(nullptr), _output(nullptr), _reinterpret_input_as_3d(false)
-{
-}
-
-void CLGEMMReshapeLHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, lhs_info, reinterpret_input_as_3d);
-}
-
-void CLGEMMReshapeLHSMatrixKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), lhs_info, reinterpret_input_as_3d));
-
-    auto padding_info = get_padding_info({ input });
-
-    _input                   = input;
-    _output                  = output;
-    _reinterpret_input_as_3d = reinterpret_input_as_3d;
-
-    const unsigned int src_w           = input->info()->dimension(0);
-    const unsigned int src_h           = _reinterpret_input_as_3d ? input->info()->dimension(1) * input->info()->dimension(2) : input->info()->dimension(1);
-    const unsigned int partial_load_m0 = src_h % lhs_info.m0;
-    const unsigned int partial_load_k0 = src_w % lhs_info.k0;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
-    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
-    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w));
-    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h));
-    build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-    build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0));
-    build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0));
-
-    std::string kernel_name("gemm_reshape_lhs_matrix_");
-    kernel_name += lhs_info.transpose ? "t" : "nt";
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), lhs_info, reinterpret_input_as_3d);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = "gemm_reshape_lhs_matrix_";
-    _config_id += (_reinterpret_input_as_3d ? "3d_" : "");
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.m0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.k0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.v0);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.interleave);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(lhs_info.transpose);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMReshapeLHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, lhs_info, reinterpret_input_as_3d));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), lhs_info, reinterpret_input_as_3d).first);
-
-    return Status{};
-}
-
-void CLGEMMReshapeLHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_3D();
-
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 2 * num_arguments_per_3D_tensor();
-        const unsigned int total_cross_plane_pad = _input->info()->padding().top + _input->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
deleted file mode 100644
index 92202a26fc..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
-#define ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
- *  In particular, this function splits the input matrix in blocks of size M0xK0 (defined through GEMMLHSInfo) and
- *  stores each one in the output matrix unrolling the values
- */
-class CLGEMMReshapeLHSMatrixKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMReshapeLHSMatrixKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeLHSMatrixKernel(const CLGEMMReshapeLHSMatrixKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeLHSMatrixKernel &operator=(const CLGEMMReshapeLHSMatrixKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMReshapeLHSMatrixKernel(CLGEMMReshapeLHSMatrixKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMReshapeLHSMatrixKernel &operator=(CLGEMMReshapeLHSMatrixKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                   Input tensor. Data types supported: All
-     * @param[out] output                  Output tensor. Data type supported: same as @p input
-     * @param[in]  lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
-     *                                     information to reshape the input tensor. Only the following values are supported:
-     *                                     lhs_info.m0: 2,3,4,5,6,7,8
-     *                                     lhs_info.k0: 2,3,4,8,16
-     *                                     lhs_info.v0: greater than 0
-     *                                     lhs_info.transpose: true, false
-     *                                     lhs_info.interleave: true, false
-     * @param[in]  reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as 3D tensor
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context         The compile context to be used.
-     * @param[in]  input                   Input tensor. Data types supported: All
-     * @param[out] output                  Output tensor. Data type supported: same as @p input
-     * @param[in]  lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
-     *                                     information to reshape the input tensor. Only the following values are supported:
-     *                                     lhs_info.m0: 2,3,4,5,6,7,8
-     *                                     lhs_info.k0: 2,3,4,8,16
-     *                                     lhs_info.v0: greater than 0
-     *                                     lhs_info.transpose: true, false
-     *                                     lhs_info.interleave: true, false
-     * @param[in]  reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as 3D tensor
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeLHSMatrixKernel
-     *
-     * @param[in] input                   Input tensor info. Data types supported: All
-     * @param[in] output                  Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
-     * @param[in] lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
-     *                                    information to reshape the input tensor. Only the following values are supported:
-     *                                    lhs_info.m0: 2,3,4,5,6,7,8
-     *                                    lhs_info.k0: 2,3,4,8,16
-     *                                    lhs_info.v0: greater than 0
-     *                                    lhs_info.transpose: true, false
-     *                                    lhs_info.interleave: true, false
-     * @param[in] reinterpret_input_as_3d True if the input has to be reinterpreted as 3D tensor
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d);
-
-    // Inherited methods overridden
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    bool             _reinterpret_input_as_3d;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H */
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
deleted file mode 100644
index 33de61ed01..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/gemm/CLGEMMHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
-    ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
-
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
-    if(rhs_info.export_to_cl_image)
-    {
-        const TensorInfo tensor_reshaped_info(compute_rhs_reshaped_shape(*input, rhs_info), 1, input->data_type());
-        ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info));
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_rhs_reshaped_shape(*input, rhs_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
-    const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
-    const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
-    bool               window_changed                      = false;
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*input, rhs_info)));
-
-    // Configure window
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    AccessWindowStatic    output_access(output, 0, 0, output->dimension(0), output->dimension(1));
-
-    window_changed = update_window_and_padding(win, input_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
-    if(rhs_info.export_to_cl_image)
-    {
-        arm_compute::cl_gemm::update_padding_for_cl_image(output);
-    }
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window collapsed = win.collapse(win, Window::DimZ);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMReshapeRHSMatrixKernel::CLGEMMReshapeRHSMatrixKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLGEMMReshapeRHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, rhs_info);
-}
-
-void CLGEMMReshapeRHSMatrixKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), rhs_info));
-
-    _input  = input;
-    _output = output;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
-    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-    build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE");
-    build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
-    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-
-    std::string kernel_name("gemm_reshape_rhs_matrix_");
-    kernel_name += rhs_info.transpose ? "t" : "nt";
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), rhs_info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLGEMMReshapeRHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, rhs_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), rhs_info).first);
-
-    return Status{};
-}
-
-void CLGEMMReshapeRHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_3D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
deleted file mode 100644
index 911484ea76..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
-#define ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
- *  In particular, this kernel splits the input matrix in blocks of size K0xN0 and stores each one in
- *  the output matrix unrolling the values */
-class CLGEMMReshapeRHSMatrixKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMReshapeRHSMatrixKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeRHSMatrixKernel(const CLGEMMReshapeRHSMatrixKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeRHSMatrixKernel &operator=(const CLGEMMReshapeRHSMatrixKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMReshapeRHSMatrixKernel(CLGEMMReshapeRHSMatrixKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default;
-    /** Default destructor */
-    ~CLGEMMReshapeRHSMatrixKernel() = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
-     *       required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-     *       Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32, F16
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *       -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-     *
-     * @param[in]  input    Input tensor. Data types supported: All
-     * @param[out] output   Output tensor. Data type supported: same as @p input
-     * @param[in]  rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
-     *                      information to reshape the input tensor. Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
-     *                      rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
-     *                      rhs_info.h0: greater than 0
-     *                      rhs_info.transpose: true, false
-     *                      rhs_info.interleave: true, false
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
-     *       required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-     *       Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32, F16
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *       -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All
-     * @param[out] output          Output tensor. Data type supported: same as @p input
-     * @param[in]  rhs_info        RHS matrix information to be used for reshaping. This object contains all the necessary
-     *                             information to reshape the input tensor. Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
-     *                             rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
-     *                             rhs_info.h0: greater than 0
-     *                             rhs_info.transpose: true, false
-     *                             rhs_info.interleave: true, false
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel
-     *
-     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
-     *       required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-     *       Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required:
-     *       -# rhs_info.n0 can only be 4, 8 and 16
-     *       -# rhs_info.k0 can only be 4, 8 and 16
-     *       -# Data type can only be F32, F16
-     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
-     *       -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
-     *       -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
-     *       -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-     *
-     * @param[in] input    Input tensor info. Data types supported: All
-     * @param[in] output   Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
-     * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
-     *                     information to reshape the input tensor. Only the following values are supported:
-     *                     rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
-     *                     rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false),(only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
-     *                     rhs_info.h0: greater than 0
-     *                     rhs_info.transpose: true, false
-     *                     rhs_info.interleave: true, false
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info);
-
-    // Inherited methods overridden
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H */
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index e33bc7afd7..904bb07282 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLGatherKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -34,20 +36,22 @@ namespace arm_compute
 {
 namespace
 {
-inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+inline Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
     const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
-    ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() + indices->num_dimensions() - 1) > 4);
+
     ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+            input->tensor_shape(), indices->tensor_shape(), actual_axis);
         ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
     }
 
@@ -56,26 +60,27 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
     const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
     // Output auto initialization if not yet initialized
-    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+        input->tensor_shape(), indices->tensor_shape(), actual_axis);
     auto_init_if_empty((*output), output_shape, 1, input->data_type());
 
     // Create window
     Window win = calculate_max_window(*output, Steps());
-    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     return std::make_pair(Status{}, win);
 }
 
 } // namespace
 
-CLGatherKernel::CLGatherKernel()
-    : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
@@ -83,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices,
     configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
 }
 
-void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGatherKernel::configure(const CLCompileContext &compile_context,
+                               const ICLTensor        *input,
+                               const ICLTensor        *indices,
+                               ICLTensor              *output,
+                               int                     axis)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
-    auto padding_info = get_padding_info({ input, output, indices });
+    auto padding_info = get_padding_info({input, output, indices});
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
 
     // Configure kernel window
@@ -100,10 +109,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
 
     // Set build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
     build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DINDICES_DIMS=" + support::cpp11::to_string(indices->info()->num_dimensions()));
     build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+    build_opts.add_option("-DINDEX_LIMIT=" + support::cpp11::to_string(input->info()->tensor_shape()[_axis]));
 
     // Create kernel
     _kernel = create_kernel(compile_context, "gather", build_opts.options());
@@ -111,10 +122,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
     return Status{};
 }
 
@@ -126,7 +139,7 @@ void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
     Window       window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     unsigned int idx              = 0;
     add_4D_tensor_argument(idx, _input, window_collapsed);
-    add_1D_tensor_argument(idx, _indices, window_collapsed);
+    add_4D_tensor_argument(idx, _indices, window_collapsed);
     add_4D_tensor_argument(idx, _output, window_collapsed);
     enqueue(queue, *this, window_collapsed, lws_hint());
 }
diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h
index 8f472a4696..db4b49d2f5 100644
--- a/src/core/CL/kernels/CLGatherKernel.h
+++ b/src/core/CL/kernels/CLGatherKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLGATHERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -63,7 +64,11 @@ public:
      * @param[out] output          Destination tensor. Data type supported: Same as @p input
      * @param[in]  axis            (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *indices,
+                   ICLTensor              *output,
+                   int                     axis = 0);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
      *
@@ -74,7 +79,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 988bb39d88..b9ff72b928 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -48,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
     ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
-    if(all_anchors->total_size() > 0)
+    if (all_anchors->total_size() > 0)
     {
         size_t feature_height = info.feat_height();
         size_t feature_width  = info.feat_width();
@@ -58,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
         ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
         ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
 
-        if(is_data_type_quantized(anchors->data_type()))
+        if (is_data_type_quantized(anchors->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
         }
@@ -67,20 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
 }
 } // namespace
 
-CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
-    : _anchors(nullptr), _all_anchors(nullptr)
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const ICLTensor          *anchors,
+                                          ICLTensor                *all_anchors,
+                                          const ComputeAnchorsInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
 }
 
-void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const CLCompileContext   &compile_context,
+                                          const ICLTensor          *anchors,
+                                          ICLTensor                *all_anchors,
+                                          const ComputeAnchorsInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
-    auto padding_info = get_padding_info({ anchors, all_anchors });
+    auto padding_info = get_padding_info({anchors, all_anchors});
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
 
     // Metadata
@@ -91,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
 
     // Initialize the output if empty
     const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
-    auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+    auto_init_if_empty(*all_anchors->info(),
+                       TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
 
     // Set instance variables
     _anchors     = anchors;
@@ -108,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
     build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
     build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
 
-    if(is_quantized)
+    if (is_quantized)
     {
         const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
         build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
@@ -116,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
     }
 
     // Create kernel
-    const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts.options());
+    const std::string kernel_name =
+        (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields).
     // This means we don't need to pad on the X dimension, as we know in advance how many fields
@@ -127,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo        *anchors,
+                                           const ITensorInfo        *all_anchors,
+                                           const ComputeAnchorsInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
     return Status{};
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
index d26795ac7d..e08f281d6c 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -62,7 +62,10 @@ public:
      * @param[in]  info            Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+    void configure(const CLCompileContext   &compile_context,
+                   const ICLTensor          *anchors,
+                   ICLTensor                *all_anchors,
+                   const ComputeAnchorsInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
      *
@@ -81,5 +84,5 @@ private:
     const ICLTensor *_anchors;
     ICLTensor       *_all_anchors;
 };
-} // arm_compute
+} // namespace arm_compute
 #endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
deleted file mode 100644
index 07309de83c..0000000000
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-#include <tuple>
-#include <utility>
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-struct Im2ColConfiguration
-{
-    std::string           kernel_name{};
-    std::set<std::string> build_options{};
-    unsigned int          num_elems_processed_per_iteration{};
-    bool                  is_padding_required_nchw{};
-};
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                          unsigned int num_groups)
-{
-    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % num_groups) != 0);
-
-    // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
-    const unsigned int width_idx    = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const unsigned int height_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const unsigned     total_width  = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
-    const unsigned     total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
-    ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
-
-    if(output->total_size() > 0)
-    {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                                                        unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto initialization if not yet initialized
-    TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
-
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
-
-    const DataLayout   data_layout  = input->data_layout();
-    const unsigned int width_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int input_width  = input->dimension(width_idx);
-    const unsigned int input_height = input->dimension(height_idx);
-
-    // Configure the execute window based on the selected optimal OpenCL kernel
-    bool   window_changed = false;
-    Window win;
-
-    if(data_layout == DataLayout::NHWC)
-    {
-        win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-        const int xin_start = 0;
-        const int xin_end   = input->dimension(0);
-        const int yin_start = 0;
-        const int yin_end   = input->dimension(1);
-
-        const int xout_start = 0;
-        const int xout_end   = output->dimension(0);
-        const int yout_start = 0;
-        const int yout_end   = output->dimension(1);
-
-        AccessWindowStatic input_access(input, xin_start, yin_start, xin_end, yin_end);
-        AccessWindowStatic output_access(output, xout_start, yout_start, xout_end, yout_end);
-        window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
-    }
-    else
-    {
-        if(is_padding_required_nchw)
-        {
-            const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
-            win = calculate_max_window(*input,
-                                       Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
-            AccessWindowStatic input_access(input,
-                                            -border.left,
-                                            -border.top,
-                                            ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
-                                            input_height + border.bottom);
-            window_changed = window_changed || update_window_and_padding(win, input_access);
-        }
-        else
-        {
-            // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
-            // update_window_and_padding() can be skipped
-            win = calculate_max_window(*input, Steps());
-        }
-    }
-    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-    // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
-    win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
-{
-    const DataLayout   data_layout   = input->data_layout();
-    const DataType     data_type     = input->data_type();
-    const unsigned int width_idx     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int channel_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const unsigned int input_width   = input->dimension(width_idx);
-    const unsigned int input_height  = input->dimension(height_idx);
-    const unsigned int input_channel = input->dimension(channel_idx);
-
-    const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
-
-    // Im2Col configuration
-    std::string                   kernel_name = "im2col_generic_";
-    CLBuildOptions                build_opts;
-    unsigned int                  num_elems_processed_per_iteration = 1;
-    bool                          is_padding_required_nchw          = false;
-    const UniformQuantizationInfo qinfo                             = input->quantization_info().uniform();
-
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->element_size()));
-    build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
-    build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
-    build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first));
-    build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second));
-    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
-    build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
-    build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
-    build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
-    build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
-    build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
-    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
-    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
-    build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
-    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
-    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
-    build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
-    build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
-    build_opts.add_option_if(has_bias, "-DHAS_BIAS");
-
-    if(data_layout == DataLayout::NHWC)
-    {
-        num_elems_processed_per_iteration = std::min(2U, input_channel);
-        is_padding_required_nchw          = false;
-
-        // Only the 3x3 and 9x9 cases are optimized for NHWC
-        if(kernel_dims == Size2D(3U, 3U))
-        {
-            kernel_name = "im2col3x3_";
-        }
-        else if(kernel_dims == Size2D(9U, 9U))
-        {
-            kernel_name = "im2col9x9_";
-        }
-
-        // Get boundary vector (the first/last vector with potentially a partial vector size) size
-        // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vec size is the (full) vector size
-        // otherwise, the boundary vec size is the (partial) remainder vector size
-        const unsigned int vec_size          = num_elems_processed_per_iteration;
-        const unsigned int partial_vec_size  = input_channel % vec_size;
-        const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size);
-        build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size));
-        build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size));
-    }
-    else
-    {
-        if(dilation == Size2D(1U, 1U))
-        {
-            const bool squared_im2col = kernel_dims.width == kernel_dims.height;
-            if(squared_im2col)
-            {
-                // Check if we can run an optimized im2col for NCHW
-                switch(kernel_dims.width)
-                {
-                    case 1:
-                        // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
-                        if(conv_info.stride().first == 1 && !conv_info.has_padding())
-                        {
-                            kernel_name                       = "im2col1x1_stridex1_";
-                            num_elems_processed_per_iteration = 4;
-                            is_padding_required_nchw          = true;
-                        }
-                        break;
-                    case 3:
-                        kernel_name                       = "im2col3x3_";
-                        num_elems_processed_per_iteration = 1;
-                        is_padding_required_nchw          = true;
-                        break;
-                    case 5:
-                        kernel_name                       = "im2col5x5_";
-                        num_elems_processed_per_iteration = 1;
-                        is_padding_required_nchw          = true;
-                        break;
-                    case 11:
-                        // Optimized im2col11x11 if pad_x = pad_y = 0
-                        if(!conv_info.has_padding())
-                        {
-                            kernel_name                       = "im2col11x11_padx0_pady0_";
-                            num_elems_processed_per_iteration = 1;
-                            is_padding_required_nchw          = true;
-                        }
-                        break;
-                    default:
-                        kernel_name                       = "im2col_generic_";
-                        num_elems_processed_per_iteration = 1;
-                        is_padding_required_nchw          = false;
-                        break;
-                }
-            }
-            else if(kernel_dims.width > 1 && !conv_info.has_padding())
-            {
-                kernel_name                       = "im2col_generic_padx0_pady0_";
-                num_elems_processed_per_iteration = 1;
-                is_padding_required_nchw          = false;
-
-                // Optimized im2col is performed using one or more vector operations with the specified vector size
-                // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
-                // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
-                // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
-                // Using the vector size of 8, however, may be faster.
-                // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
-                // is used instead.)
-                const size_t vector_size           = std::min(static_cast<size_t>(4), kernel_dims.width);
-                const size_t width_mod_vector_size = kernel_dims.width % vector_size;
-                build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
-                build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
-            }
-        }
-    }
-
-    // Append the data layout to the kernel_name
-    kernel_name += lower_string(string_from_data_layout(data_layout));
-
-    Im2ColConfiguration im2col_config;
-    im2col_config.kernel_name                       = kernel_name;
-    im2col_config.build_options                     = build_opts.options();
-    im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration;
-    im2col_config.is_padding_required_nchw          = is_padding_required_nchw;
-
-    return im2col_config;
-}
-} // namespace
-
-CLIm2ColKernel::CLIm2ColKernel()
-    : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
-{
-}
-
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                               unsigned int num_groups)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
-}
-
-void CLIm2ColKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
-                               const Size2D &dilation,
-                               unsigned int  num_groups)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
-
-    auto padding_info = get_padding_info({ input, output });
-    _data_layout      = input->info()->data_layout();
-
-    const unsigned int width_idx    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int input_width  = input->info()->dimension(width_idx);
-    const unsigned int input_height = input->info()->dimension(height_idx);
-
-    // Select and configure the optimal OpenCL kernel to run.
-    // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
-    // and the padding requirement flag
-    Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation, num_groups);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options);
-
-    _input                             = input;
-    _output                            = output;
-    _convolved_dims                    = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
-    _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
-    _kernel_dims                       = kernel_dims; // Only needed by the Tuner
-    _conv_info                         = conv_info;   // Only needed by the Tuner
-    _num_groups                        = num_groups;
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
-                                                    im2col_config.is_padding_required_nchw, num_groups);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = im2col_config.kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(num_groups);
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(_data_layout));
-
-    ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
-}
-
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                                unsigned int num_groups)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
-    Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation, num_groups);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
-                                                              im2col_config.is_padding_required_nchw, num_groups)
-                                .first);
-    return Status{};
-}
-
-void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    // Get initial windows
-    // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    window_collapsed.set_dimension_step(Window::DimZ, 1);
-
-    Window window_output;
-    window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-
-    const Window first_slice_3d = window_collapsed.first_slice_window_3D();
-
-    Window slice     = first_slice_3d;
-    Window slice_in  = first_slice_3d;
-    Window slice_out = window_output.first_slice_window_2D();
-
-    if(_data_layout == DataLayout::NHWC)
-    {
-        const Window tmp_win     = window.collapse_if_possible(ICLKernel::window(), 3);
-        const int    num_batches = tmp_win[3].end();
-
-        slice.set(1, Window::Dimension(0, static_cast<int>(_output->info()->tensor_shape()[1]), 1));
-        slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1));
-    }
-    else
-    {
-        slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration));
-        slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
-        // Note: In case of NCHW the 3rd dimension is already set collapsing the input window
-    }
-
-    // Setup input slice
-    // The dimensions of the input are increased within the OpenCL kernel
-    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    // Setup output slice
-    // The dimensions of the output are increased within the OpenCL kernel
-    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
-    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
-    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        if(_num_groups == 1)
-        {
-            add_2D_tensor_argument(idx, _output, slice_out);
-        }
-        else
-        {
-            add_3D_tensor_argument(idx, _output, slice_out);
-        }
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLIm2ColKernel.h b/src/core/CL/kernels/CLIm2ColKernel.h
deleted file mode 100644
index 2920c7d138..0000000000
--- a/src/core/CL/kernels/CLIm2ColKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLIM2COLKERNEL_H
-#define ARM_COMPUTE_CLIM2COLKERNEL_H
-
-#include "arm_compute/core/Size2D.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
- * It is used to transform a convolution to a plain matrix multiplication.
- *
- * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class CLIm2ColKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLIm2ColKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIm2ColKernel(const CLIm2ColKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIm2ColKernel &operator=(const CLIm2ColKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLIm2ColKernel(CLIm2ColKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLIm2ColKernel &operator=(CLIm2ColKernel &&) = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                         while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[out] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                         while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in]  kernel_dims The kernel dimensions (width and height).
-     * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  has_bias    In case biases are provided expands the matrix with 1.
-     *                         This is valid only for non-quantized inputs.
-     * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution.
-     *                         Number of groups other than 1 is only supported for NCHW data layout.
-     *                         Number of groups should be multiple to the number of channels.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
-                   unsigned int num_groups = 1);
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[out] output          The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                             while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in]  kernel_dims     The kernel dimensions (width and height).
-     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  has_bias        In case biases are provided expands the matrix with 1.
-     * @param[in]  dilation        (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
-                   const Size2D &dilation   = Size2D(1U, 1U),
-                   unsigned int  num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
-     *
-     * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                        while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                        while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in] kernel_dims The kernel dimensions (width and height).
-     * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] has_bias    In case biases are provided expands the matrix with 1.
-     *                        This is valid only for non-quantized inputs.
-     * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution.
-     *                        Number of groups other than 1 is only supported for NCHW data layout.
-     *                        Number of groups should be multiple to the number of channels.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
-                           unsigned int num_groups = 1);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    DataLayout       _data_layout;
-    std::pair<unsigned int, unsigned int> _convolved_dims;
-    unsigned int  _num_elems_processed_per_iteration;
-    Size2D        _kernel_dims;
-    PadStrideInfo _conv_info;
-    unsigned int  _num_groups;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLIM2COLKERNEL_H */
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index f9e1cbec27..b13eb16556 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,65 +29,154 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status validate_arguments(const ITensorInfo                          *input,
+                          const ITensorInfo                          *output,
+                          const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+                                        "Input and output have different number of channels");
     }
 
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+
+    if (output != nullptr && output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+                                        "Input and output have different number of channels");
+    }
+
+    return Status{};
+}
+} // namespace
+
+CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr)
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void CLComputeMeanVariance::configure(const CLCompileContext &compile_context,
+                                      ICLTensor              *input,
+                                      ICLTensor              *output,
+                                      bool                    use_mixed_precision)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    auto padding_info = get_padding_info({input, output});
+
+    _input  = input;
+    _output = output == nullptr ? input : output;
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_meanvar(_input->info(), _output->info()));
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DINTERNAL_DATA_TYPE=" +
+                          (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
+    // Create kernel
+    _kernel = create_kernel(compile_context, "compute_mean_var", build_opts.options());
+
     // We handle the planes manually
-    Window win = calculate_max_window(*input, Steps(1));
+    Window             win           = calculate_max_window(*(input->info()), Steps(1));
+    const auto         data_layout   = input->info()->data_layout();
+    const unsigned int channel_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int batches_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+    const unsigned int input_channel = input->info()->dimension(channel_idx);
+    const unsigned int input_batches = input->info()->dimension(batches_idx);
+    const TensorShape  out_shape(input_channel, 2u, input_batches);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+    if (use_mixed_precision)
+    {
+        auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32);
+    }
+    else
+    {
+        auto_init_if_empty(*_output->info(), out_shape, 1, input->info()->data_type());
+    }
+    ICLKernel::configure_internal(win);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
 
-    // CLInstanceNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
-    Coordinates coord;
-    coord.set_num_dimensions(output->num_dimensions());
-    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-    return std::make_pair(Status{}, win);
+Status CLComputeMeanVariance::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_meanvar(input, output));
+    return Status{};
 }
-} // namespace
 
-CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
+void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue)
 {
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window collapsed_window = window.collapse(window, Window::DimZ);
+
+    // We will process the planes together
+    if (_input->info()->data_layout() == DataLayout::NCHW)
+    {
+        collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+        collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+    }
+    else
+    {
+        collapsed_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        collapsed_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(3), 1));
+    }
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, collapsed_window);
+    add_3D_tensor_argument(idx, _output, collapsed_window);
+
+    enqueue(queue, *this, collapsed_window, lws_hint());
 }
 
-void CLInstanceNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _run_in_place(false)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext                     &compile_context,
+                                                   ICLTensor                                  *input,
+                                                   ICLTensor                                  *mean_var,
+                                                   ICLTensor                                  *output,
+                                                   const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input  = input;
     _output = output == nullptr ? input : output;
+    _mean   = mean_var;
 
     _run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), info));
@@ -95,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision
+                                                         ? "float"
+                                                         : get_cl_type_from_data_type(input->info()->data_type())));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -110,16 +201,21 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
     _kernel = create_kernel(compile_context, "instance_normalization", build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(_input->info(), _output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-    ICLKernel::configure_internal(std::get<1>(win_config));
+    Window win = calculate_max_window(*input->info(), Steps(1));
+    if (output != nullptr)
+    {
+        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
+    }
+
+    ICLKernel::configure_internal(win);
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo                          *input,
+                                                    const ITensorInfo                          *output,
+                                                    const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
     return Status{};
 }
 
@@ -131,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
     Window collapsed_window = window.collapse(window, Window::DimZ);
 
     // We will process the planes together
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if (_input->info()->data_layout() == DataLayout::NCHW)
     {
         collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
         collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -144,7 +240,9 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
 
     unsigned int idx = 0;
     add_4D_tensor_argument(idx, _input, collapsed_window);
-    if(!_run_in_place)
+    add_3D_tensor_argument(idx, _mean, collapsed_window);
+
+    if (!_run_in_place)
     {
         add_4D_tensor_argument(idx, _output, collapsed_window);
     }
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
index d4444f0b20..9f436da7f6 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
 #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 namespace arm_compute
 {
 // Forward declarations
@@ -52,21 +52,18 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in, out] input  Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
-     *                        In case of @p output tensor = nullptr this tensor will store the result of the normalization.
-     * @param[out]     output Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]      info   Kernel meta-data descriptor
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
-    /** Set the input and output tensors.
-     *
      * @param[in]      compile_context The compile context to be used.
      * @param[in, out] input           Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
      *                                 In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     * @param[in]      mean_var        Tensor containing the precomputed mean and variance values. Data types supported: F32.
      * @param[out]     output          Destination tensor. Data types and data layouts supported: same as @p input.
      * @param[in]      info            Kernel meta-data descriptor
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
+    void configure(const CLCompileContext                     &compile_context,
+                   ICLTensor                                  *input,
+                   ICLTensor                                  *mean_var,
+                   ICLTensor                                  *output,
+                   const InstanceNormalizationLayerKernelInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
      *
@@ -76,7 +73,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -84,7 +82,53 @@ public:
 private:
     ICLTensor *_input;
     ICLTensor *_output;
+    ICLTensor *_mean;
     bool       _run_in_place;
 };
+
+/** Interface for compute Mean and Variance per channel */
+class CLComputeMeanVariance : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLComputeMeanVariance();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComputeMeanVariance(const CLComputeMeanVariance &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComputeMeanVariance &operator=(const CLComputeMeanVariance &) = delete;
+    /** Default Move Constructor. */
+    CLComputeMeanVariance(CLComputeMeanVariance &&) = default;
+    /** Default move assignment operator */
+    CLComputeMeanVariance &operator=(CLComputeMeanVariance &&) = default;
+    /** Default destructor */
+    ~CLComputeMeanVariance() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]      compile_context     The compile context to be used.
+     * @param[in, out] input               Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
+     *                                     In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     * @param[out]     output              Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]      use_mixed_precision Use mixed precision in case of FP16 execution
+     */
+    void
+    configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
+     *
+     * @param[in] input  Source tensor info. Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+     * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_input;
+    ICLTensor *_output;
+};
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 9e91d98f7c..9ed9d7c5b0 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,11 +29,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -42,9 +43,8 @@ namespace
 {
 constexpr int max_input_tensor_dim = 3;
 
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
 {
     ARM_COMPUTE_UNUSED(epsilon);
 
@@ -54,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+                                    "Actual normalization axis greater than max number of dimensions");
 
     // Reduce shape on axis
     TensorShape sum_shape = input->tensor_shape();
     sum_shape.set(actual_axis, 1);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -71,40 +72,30 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
 
     return Status{};
 }
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
-
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->valid_region());
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-
-    return std::make_tuple(err, win);
-}
 } // namespace
 
 CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
     : _input(nullptr), _sum(nullptr), _output(nullptr), _actual_axis(0), _epsilon(1e-12)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(
+    const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon);
 }
 
-void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context,
+                                         const ICLTensor        *input,
+                                         const ICLTensor        *sum,
+                                         ICLTensor              *output,
+                                         int                     axis,
+                                         float                   epsilon)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
+    auto padding_info = get_padding_info({input, sum, output});
 
     _input       = input;
     _sum         = sum;
@@ -112,35 +103,40 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
     _actual_axis = wrap_around(axis, max_input_tensor_dim);
     _epsilon     = epsilon;
 
+    const unsigned int vec_size_x =
+        adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+    const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers));
 
     // Create kernel
     std::string  kernel_name;
     unsigned int idx = 0;
-    switch(_actual_axis)
+    switch (_actual_axis)
     {
         case 0:
-            kernel_name = "x";
+            kernel_name = "l2_normalize_x";
             idx         = num_arguments_per_2D_tensor() * 3;
             break;
         case 1:
-            kernel_name = "y";
+            kernel_name = "l2_normalize_y";
             idx         = num_arguments_per_2D_tensor() * 3;
             break;
         case 2:
-            kernel_name = "z";
+            kernel_name = "l2_normalize_z";
             idx         = num_arguments_per_3D_tensor() * 3;
             break;
         default:
             ARM_COMPUTE_ERROR("Axis not supported");
     }
-    _kernel = create_kernel(compile_context, "l2_normalize_" + kernel_name, build_opts);
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set epsilon argument
-    if(input->info()->data_type() == DataType::F32)
+    if (input->info()->data_type() == DataType::F32)
     {
         _kernel.setArg<cl_float>(idx, _epsilon);
     }
@@ -150,17 +146,19 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(_input->info(), _output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+    Window win = calculate_max_window(*input->info(), Steps(vec_size_x));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
 
-    ICLKernel::configure_internal(std::get<1>(win_config));
+    ICLKernel::configure_internal(win);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status CLL2NormalizeLayerKernel::validate(
+    const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
     return Status{};
 }
 
@@ -171,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
 
     Window window_sum(window);
 
-    switch(_actual_axis)
+    switch (_actual_axis)
     {
         case 0:
         {
@@ -185,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
                 add_2D_tensor_argument(idx, _sum, sum_slice);
                 add_2D_tensor_argument(idx, _output, in_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+            } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
         }
         break;
         case 1:
@@ -201,8 +198,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
                 add_2D_tensor_argument(idx, _sum, sum_slice);
                 add_2D_tensor_argument(idx, _output, in_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+            } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
         }
         break;
         case 2:
@@ -217,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
                 add_3D_tensor_argument(idx, _sum, sum_slice);
                 add_3D_tensor_argument(idx, _output, in_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+            } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
         }
         break;
         default:
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
index edc0585217..5c9ab94ce5 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -70,7 +71,12 @@ public:
      * @param[in]  axis            Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
      * @param[in]  epsilon         Lower bound value for the normalization.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *sum,
+                   ICLTensor              *output,
+                   int                     axis,
+                   float                   epsilon);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
      *
@@ -84,7 +90,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
deleted file mode 100644
index a439c2448e..0000000000
--- a/src/core/CL/kernels/CLLKTrackerKernel.cpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLLKTrackerKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cmath>
-
-using namespace arm_compute;
-
-void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
-                                      ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                                      bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), old_points, new_points_estimates, old_points_internal, new_points_internal, use_initial_estimate, level, num_levels, pyramid_scale);
-}
-
-void CLLKTrackerInitKernel::configure(const CLCompileContext &compile_context, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
-                                      ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                                      bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
-
-{
-    ARM_COMPUTE_ERROR_ON(old_points == nullptr);
-    ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
-    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
-
-    const float scale = std::pow(pyramid_scale, level);
-
-    // Create kernel
-    std::string kernel_name = "init_level";
-    if(level == (num_levels - 1))
-    {
-        kernel_name += (use_initial_estimate) ? std::string("_max_initial_estimate") : std::string("_max");
-    }
-    _kernel = create_kernel(compile_context, kernel_name);
-
-    // Set static kernel arguments
-    unsigned int idx = 0;
-    if(level == (num_levels - 1))
-    {
-        _kernel.setArg(idx++, old_points->cl_buffer());
-        if(use_initial_estimate)
-        {
-            _kernel.setArg(idx++, new_points_estimates->cl_buffer());
-        }
-    }
-    _kernel.setArg(idx++, old_points_internal->cl_buffer());
-    _kernel.setArg(idx++, new_points_internal->cl_buffer());
-    _kernel.setArg<cl_float>(idx++, scale);
-
-    // Configure kernel window
-    Window window;
-    window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1));
-    window.set(Window::DimY, Window::Dimension(0, 1, 1));
-    ICLKernel::configure_internal(window);
-}
-
-void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    enqueue(queue, *this, window, lws_hint());
-}
-
-void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), new_points_internal, new_points);
-}
-
-void CLLKTrackerFinalizeKernel::configure(const CLCompileContext &compile_context, ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
-
-{
-    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
-    ARM_COMPUTE_ERROR_ON(new_points == nullptr);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "finalize");
-
-    // Set static kernel arguments
-    unsigned int idx = 0;
-    _kernel.setArg(idx++, new_points_internal->cl_buffer());
-    _kernel.setArg(idx++, new_points->cl_buffer());
-
-    // Configure kernel window
-    Window window;
-    window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
-    window.set(Window::DimY, Window::Dimension(0, 1, 1));
-    ICLKernel::configure_internal(window);
-}
-
-void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    enqueue(queue, *this, window, lws_hint());
-}
-
-CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel()
-    : _old_input(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr)
-{
-}
-
-void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
-                                        ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                                        ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                                        size_t window_dimension, size_t level)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), old_input, old_scharr_gx, old_scharr_gy, old_points_internal, new_points_internal, coeff_table, old_ival, window_dimension, level);
-}
-
-void CLLKTrackerStage0Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
-                                        ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                                        ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                                        size_t window_dimension, size_t level)
-
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
-    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
-    ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
-    ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
-
-    _old_input     = old_input;
-    _old_scharr_gx = old_scharr_gx;
-    _old_scharr_gy = old_scharr_gy;
-
-    // Configure kernel window
-    Window window;
-    window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
-    window.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    const ValidRegion valid_region = intersect_valid_regions(
-                                         old_input->info()->valid_region(),
-                                         old_scharr_gx->info()->valid_region(),
-                                         old_scharr_gy->info()->valid_region());
-
-    update_window_and_padding(window,
-                              AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1),
-                                                 valid_region.end(0), valid_region.end(1)),
-                              AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1),
-                                                 valid_region.end(0), valid_region.end(1)),
-                              AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1),
-                                                 valid_region.end(0), valid_region.end(1)));
-
-    ICLKernel::configure_internal(window);
-
-    // Initialize required variables
-    const int       level0              = (level == 0) ? 1 : 0;
-    const int       window_size         = window_dimension;
-    const int       window_size_squared = window_dimension * window_dimension;
-    const int       window_size_half    = window_dimension / 2;
-    const float     eig_const           = 1.0f / (2.0f * window_size_squared);
-    const cl_float3 border_limits =
-    {
-        {
-            // -1 because we load 2 values at once for bilinear interpolation
-            static_cast<cl_float>(valid_region.end(0) - window_size - 1),
-            static_cast<cl_float>(valid_region.end(1) - window_size - 1),
-            static_cast<cl_float>(valid_region.start(0))
-        }
-    };
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "lktracker_stage0");
-
-    // Set arguments
-    unsigned int idx = 3 * num_arguments_per_2D_tensor();
-    _kernel.setArg(idx++, old_points_internal->cl_buffer());
-    _kernel.setArg(idx++, new_points_internal->cl_buffer());
-    _kernel.setArg(idx++, coeff_table->cl_buffer());
-    _kernel.setArg(idx++, old_ival->cl_buffer());
-    _kernel.setArg<cl_int>(idx++, window_size);
-    _kernel.setArg<cl_int>(idx++, window_size_squared);
-    _kernel.setArg<cl_int>(idx++, window_size_half);
-    _kernel.setArg<cl_float3>(idx++, border_limits);
-    _kernel.setArg<cl_float>(idx++, eig_const);
-    _kernel.setArg<cl_int>(idx++, level0);
-}
-
-void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Set static tensor arguments. Setting here as allocation might be deferred.
-    unsigned int idx = 0;
-    add_2D_tensor_argument(idx, _old_input, window);
-    add_2D_tensor_argument(idx, _old_scharr_gx, window);
-    add_2D_tensor_argument(idx, _old_scharr_gy, window);
-
-    enqueue(queue, *this, window, lws_hint());
-}
-
-CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel()
-    : _new_input(nullptr)
-{
-}
-
-void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                                        Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), new_input, new_points_internal, coeff_table, old_ival, termination, epsilon, num_iterations, window_dimension, level);
-}
-
-void CLLKTrackerStage1Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table,
-                                        ICLOldValArray *old_ival,
-                                        Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
-
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
-    ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
-    ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
-
-    _new_input = new_input;
-
-    // Configure kernel window
-    Window window;
-    window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
-    window.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    const ValidRegion &valid_region = new_input->info()->valid_region();
-
-    update_window_and_padding(window,
-                              AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1),
-                                                 valid_region.end(0), valid_region.end(1)));
-
-    ICLKernel::configure_internal(window);
-
-    // Initialize required variables
-    const int       level0              = (level == 0) ? 1 : 0;
-    const int       window_size         = window_dimension;
-    const int       window_size_squared = window_dimension * window_dimension;
-    const int       window_size_half    = window_dimension / 2;
-    const float     eig_const           = 1.0f / (2.0f * window_size_squared);
-    const cl_float3 border_limits =
-    {
-        {
-            // -1 because we load 2 values at once for bilinear interpolation
-            static_cast<cl_float>(valid_region.end(0) - window_size - 1),
-            static_cast<cl_float>(valid_region.end(1) - window_size - 1),
-            static_cast<cl_float>(valid_region.start(0))
-        }
-    };
-
-    // Set maximum number of iterations used for convergence
-    const size_t max_iterations = 1000;
-    num_iterations              = (termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : num_iterations;
-
-    const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "lktracker_stage1");
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor();
-    _kernel.setArg(idx++, new_points_internal->cl_buffer());
-    _kernel.setArg(idx++, coeff_table->cl_buffer());
-    _kernel.setArg(idx++, old_ival->cl_buffer());
-    _kernel.setArg<cl_int>(idx++, window_size);
-    _kernel.setArg<cl_int>(idx++, window_size_squared);
-    _kernel.setArg<cl_int>(idx++, window_size_half);
-    _kernel.setArg<cl_int>(idx++, num_iterations);
-    _kernel.setArg<cl_float>(idx++, epsilon);
-    _kernel.setArg<cl_float3>(idx++, border_limits);
-    _kernel.setArg<cl_float>(idx++, eig_const);
-    _kernel.setArg<cl_int>(idx++, level0);
-    _kernel.setArg<cl_int>(idx++, term_epsilon);
-}
-
-void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Set static tensor arguments. Setting here as allocation might be deferred.
-    unsigned int idx = 0;
-    add_2D_tensor_argument(idx, _new_input, window);
-
-    enqueue(queue, *this, window, lws_hint());
-}
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.h b/src/core/CL/kernels/CLLKTrackerKernel.h
deleted file mode 100644
index 2d2966854a..0000000000
--- a/src/core/CL/kernels/CLLKTrackerKernel.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLKTRACKERKERNEL_H
-#define ARM_COMPUTE_CLLKTRACKERKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to run the initialization step of LKTracker */
-class CLLKTrackerInitKernel : public ICLKernel
-{
-public:
-    /** Initialise the kernel input and output
-     *
-     * @param[in]  old_points           Pointer to the @ref ICLKeyPointArray storing old key points
-     * @param[in]  new_points_estimates Pointer to the @ref ICLKeyPointArray storing new estimates key points
-     * @param[out] old_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint old points
-     * @param[out] new_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint new points
-     * @param[in]  use_initial_estimate The flag to indicate whether the initial estimated position should be used
-     * @param[in]  level                The pyramid level
-     * @param[in]  num_levels           The number of pyramid levels
-     * @param[in]  pyramid_scale        Scale factor used for generating the pyramid
-     */
-    void configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
-                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                   bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
-    /** Initialise the kernel input and output
-     *
-     * @param[in]  compile_context      The compile context to be used.
-     * @param[in]  old_points           Pointer to the @ref ICLKeyPointArray storing old key points
-     * @param[in]  new_points_estimates Pointer to the @ref ICLKeyPointArray storing new estimates key points
-     * @param[out] old_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint old points
-     * @param[out] new_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint new points
-     * @param[in]  use_initial_estimate The flag to indicate whether the initial estimated position should be used
-     * @param[in]  level                The pyramid level
-     * @param[in]  num_levels           The number of pyramid levels
-     * @param[in]  pyramid_scale        Scale factor used for generating the pyramid
-     */
-    void configure(const CLCompileContext &compile_context, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
-                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                   bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface to run the finalize step of LKTracker, where it truncates the coordinates stored in new_points array */
-class CLLKTrackerFinalizeKernel : public ICLKernel
-{
-public:
-    /** Initialise the kernel input and output
-     *
-     * @param[in]  new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
-     * @param[out] new_points          Pointer to the @ref ICLKeyPointArray storing new key points
-     */
-    void configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
-    /** Initialise the kernel input and output
-     *
-     * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
-     * @param[out] new_points          Pointer to the @ref ICLKeyPointArray storing new key points
-     */
-    void configure(const CLCompileContext &compile_context, ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface to run the first stage of LKTracker, where A11, A12, A22, min_eig, ival, ixval and iyval are computed */
-class CLLKTrackerStage0Kernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLKTrackerStage0Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLKTrackerStage0Kernel(const CLLKTrackerStage0Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLKTrackerStage0Kernel &operator=(const CLLKTrackerStage0Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLKTrackerStage0Kernel(CLLKTrackerStage0Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLKTrackerStage0Kernel &operator=(CLLKTrackerStage0Kernel &&) = default;
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      old_input           Pointer to the input old tensor. Data types supported: U8
-     * @param[in]      old_scharr_gx       Pointer to the input scharr X tensor. Data types supported: S16
-     * @param[in]      old_scharr_gy       Pointer to the input scharr Y tensor. Data types supported: S16
-     * @param[in]      old_points_internal Pointer to the array of CLLKInternalKeypoint old points
-     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
-     * @param[out]     coeff_table         Pointer to the array holding the Spatial Gradient coefficients
-     * @param[out]     old_ival            Pointer to the array holding internal values
-     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
-     * @param[in]      level               The pyramid level
-     */
-    void configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
-                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                   ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                   size_t window_dimension, size_t level);
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      compile_context     The compile context to be used.
-     * @param[in]      old_input           Pointer to the input old tensor. Data types supported: U8
-     * @param[in]      old_scharr_gx       Pointer to the input scharr X tensor. Data types supported: S16
-     * @param[in]      old_scharr_gy       Pointer to the input scharr Y tensor. Data types supported: S16
-     * @param[in]      old_points_internal Pointer to the array of CLLKInternalKeypoint old points
-     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
-     * @param[out]     coeff_table         Pointer to the array holding the Spatial Gradient coefficients
-     * @param[out]     old_ival            Pointer to the array holding internal values
-     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
-     * @param[in]      level               The pyramid level
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
-                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                   ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                   size_t window_dimension, size_t level);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_old_input;
-    const ICLTensor *_old_scharr_gx;
-    const ICLTensor *_old_scharr_gy;
-};
-
-/** Interface to run the second stage of LKTracker, where the motion vectors of the given points are computed */
-class CLLKTrackerStage1Kernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLKTrackerStage1Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLKTrackerStage1Kernel(const CLLKTrackerStage1Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLKTrackerStage1Kernel &operator=(const CLLKTrackerStage1Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLKTrackerStage1Kernel(CLLKTrackerStage1Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLKTrackerStage1Kernel &operator=(CLLKTrackerStage1Kernel &&) = default;
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      new_input           Pointer to the input new tensor. Data types supported: U8
-     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
-     * @param[in]      coeff_table         Pointer to the array holding the Spatial Gradient coefficients
-     * @param[in]      old_ival            Pointer to the array holding internal values
-     * @param[in]      termination         The criteria to terminate the search of each keypoint.
-     * @param[in]      epsilon             The error for terminating the algorithm
-     * @param[in]      num_iterations      The maximum number of iterations before terminating the algorithm
-     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
-     * @param[in]      level               The pyramid level
-     */
-    void configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                   Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      compile_context     The compile context to be used.
-     * @param[in]      new_input           Pointer to the input new tensor. Data types supported: U8
-     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
-     * @param[in]      coeff_table         Pointer to the array holding the Spatial Gradient coefficients
-     * @param[in]      old_ival            Pointer to the array holding internal values
-     * @param[in]      termination         The criteria to terminate the search of each keypoint.
-     * @param[in]      epsilon             The error for terminating the algorithm
-     * @param[in]      num_iterations      The maximum number of iterations before terminating the algorithm
-     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
-     * @param[in]      level               The pyramid level
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                   Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_new_input;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLLKTRACKERKERNEL_H */
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index d0ac12dcd7..e560f1de4a 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,9 +29,9 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -43,26 +43,31 @@ using namespace misc::shape_calculator;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo      *input,
+                          const ITensorInfo      *output,
+                          const PoolingLayerInfo &pool_info,
+                          const ITensorInfo      *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
 
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    PoolingType         pool_type       = pool_info.pool_type;
-    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    int                 pool_stride_x      = 0;
+    int                 pool_stride_y      = 0;
+    PoolingType         pool_type          = pool_info.pool_type;
+    const PadStrideInfo pad_stride_info    = pool_info.pad_stride_info;
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const int    pool_size_x = pool_info.pool_size.width;
-    const int    pool_size_y = pool_info.pool_size.height;
+    const int    pool_size_x               = pool_info.pool_size.width;
+    const int    pool_size_y               = pool_info.pool_size.height;
     const Size2D pool_size(pool_size_x, pool_size_y);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+                                    "Pooling indices only supported for MAX pooling method");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -72,16 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 }
 } // namespace
 
-CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
-    : _input(nullptr), _output(nullptr), _indices(nullptr)
+CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr)
 {
+    _type = CLKernelType::POOL;
 }
 
-void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const ICLTensor        *indices,
+                                          ICLTensor              *output,
+                                          const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
-    auto padding_info = get_padding_info({ input, indices, output });
+    auto padding_info = get_padding_info({input, indices, output});
 
     _input   = input;
     _output  = output;
@@ -119,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo      *input,
+                                           const ITensorInfo      *indices,
+                                           const ITensorInfo      *output,
+                                           const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
@@ -140,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu
         add_3D_tensor_argument(idx, _output, slice);
         add_3D_tensor_argument(idx, _indices, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
index cc96cf1a1f..eb18a46784 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -53,26 +53,33 @@ public:
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  indices         Tensor containing the offset to store the input elements in the output tensor.
-     *                             @ref opencl::ClPooling with indices should precede this function in order to
+     *                             @ref CLPoolingLayer with indices should precede this function in order to
      *                             properly reconstruct the output tensor.
      *                             The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
      * @param[out] output          Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *indices,
+                   ICLTensor              *output,
+                   const PoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
      *
      * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] indices   TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor.
-     *                      @ref opencl::ClPooling with indices should precede this function in order to
+     *                      @ref CLPoolingLayer with indices should precede this function in order to
      *                      properly reconstruct the output tensor.
      *                      The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo      *input,
+                           const ITensorInfo      *indices,
+                           const ITensorInfo      *output,
+                           const PoolingLayerInfo &pool_info);
 
     // Inherited methods overridden
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index a889df7930..8632bdf623 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,9 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -47,39 +50,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    if(output != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output, *input);
-    }
-
-    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-    // This kernel doesn't need padding
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    if(output != nullptr)
-    {
-        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-
-    return std::make_pair(Status{}, win);
-}
 } // namespace
 
 CLMeanStdDevNormalizationKernel::CLMeanStdDevNormalizationKernel()
     : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *output, float epsilon)
@@ -87,18 +70,28 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out
     configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
 }
 
-void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context,
+                                                ICLTensor              *input,
+                                                ICLTensor              *output,
+                                                float                   epsilon)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
     _run_in_place = (output == nullptr) || (output == input);
 
-    ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+    ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(
+        input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+
+    if (output != nullptr)
+    {
+        auto_init_if_empty(*output->info(), *input->info());
+    }
 
     _input  = input;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
 
     // Set build options
     CLBuildOptions build_opts;
@@ -106,15 +99,15 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
     build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DMEANSTDNORM_HALF");
     build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
 
     // Create kernel
     _kernel = create_kernel(compile_context, "mean_stddev_normalization", build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
 
     // Set config_id for enabling LWS tuning
     _config_id = "mean_stddev_normalization_layer_";
@@ -128,7 +121,6 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
 Status CLMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
     return Status{};
 }
 
@@ -148,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue
         add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice);
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
+    } while (window.slide_window_slice_2D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
index a1ba2b905e..e02a3c58a3 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -66,7 +66,10 @@ public:
      * @param[out]     output          (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
      * @param[in]      epsilon         (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+    void configure(const CLCompileContext &compile_context,
+                   ICLTensor              *input,
+                   ICLTensor              *output  = nullptr,
+                   float                   epsilon = 1e-8f);
     /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
      *
      * @param[in] input   Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
deleted file mode 100644
index 7017efa3c2..0000000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
-    if(output->tensor_shape().total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-        TensorShape output_shape = compute_min_max_shape(input);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    }
-
-    return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    TensorShape output_shape = compute_min_max_shape(input);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, output_shape, 1, input->data_type());
-
-    const unsigned int num_elems_processed_per_iteration = 1;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     output_access(output, 0, 0, 2, output->dimension(1));
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win);
-}
-} // namespace
-
-CLMinMaxLayerKernel::CLMinMaxLayerKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
-
-    std::set<std::string> build_opts;
-    build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-    build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.emplace("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "minmax_layer", build_opts);
-
-    auto win_config = validate_and_configure_window(input->info(), output->info());
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-    ICLKernel::configure_internal(std::get<1>(win_config));
-}
-
-Status CLMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
-    return Status{};
-}
-
-void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
-{
-    _output->map(queue, true);
-
-    Window window_output;
-    window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-    window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator output(_output, window_output);
-
-    // Reset output
-    execute_window_loop(window_output, [&](const Coordinates &)
-    {
-        auto *ptr = reinterpret_cast<float *>(output.ptr());
-        ptr[0]    = std::numeric_limits<float>::max();
-        ptr[1]    = std::numeric_limits<float>::min();
-    },
-    output);
-
-    _output->unmap(queue);
-}
-
-void CLMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
-    Window slice            = window_collapsed.first_slice_window_3D();
-    slice.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-    slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        Window output_slice = slice.shift_dimensions(2);
-
-        unsigned int idx = 0;
-        // Set inputs
-        add_3D_tensor_argument(idx, _input, slice);
-        add_1D_tensor_argument(idx, _output, output_slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.h b/src/core/CL/kernels/CLMinMaxLayerKernel.h
deleted file mode 100644
index aa2ff3f375..0000000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor.
- */
-class CLMinMaxLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMinMaxLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLayerKernel(const CLMinMaxLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLayerKernel &operator=(const CLMinMaxLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLayerKernel(CLMinMaxLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLayerKernel &operator=(CLMinMaxLayerKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32.
-     * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32.
-     * @param[out] output          Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
-     *
-     * @param[in] input  Input tensor info.  Data types supported: F32.
-     * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                   The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    /** Resets global minimum and maximum
-     *
-     * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
-     */
-    void reset(cl::CommandQueue &queue);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index d1982e77b9..b636c485e7 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -37,11 +40,10 @@
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 4;
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -52,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -62,38 +64,66 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
 {
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, *input->clone());
 
-    const unsigned int norm_idx              = get_normalization_dimension_index(input->data_layout(), norm_info);
-    const bool         is_norm_accross_width = norm_idx == 0;
+    bool             window_changed = false;
+    Window           win;
+    const DataLayout data_layout = input->data_layout();
+    if (data_layout == DataLayout::NCHW)
+    {
+        const unsigned int vec_size_x =
+            adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+        const unsigned int norm_idx             = get_normalization_dimension_index(input->data_layout(), norm_info);
+        const bool         is_norm_across_width = norm_idx == 0;
 
-    const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
-    const BorderSize   border_size  = BorderSize(0, border_width);
+        const unsigned int norm_radius = norm_info.norm_size() / 2;
+        // Border / padding calculation:
+        // For NCHW no border handling is impelmeneted in the kernel in the x axis.
+        // This means the x axis is fully-padded depending on vec_size_x and norm_size
+        // E.G. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is element 'p' is padding):
+        // In : |p|#|#|#|p|p|
+        // Out:   |#|#|#|p|
+        // The output has 1 right padding because of the vec_size_x.
+        // The input has 1 left padding because radius = 1.
+        // The input has 2 right padding because of radius = 1 AND because of the extra output padding
+        const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
+        const unsigned int border_width_right =
+            is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
+        const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
 
-    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    bool   window_changed = false;
+        win = calculate_max_window(*input, Steps(vec_size_x));
 
-    // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
-    // Reads can occur within the valid region of the input
-    if(is_norm_accross_width)
-    {
-        AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
-        window_changed = window_changed || update_window_and_padding(win, input_access);
+        // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
+        // Reads can occur within the valid region of the input
+        if (is_norm_across_width)
+        {
+            AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+            window_changed = window_changed || update_window_and_padding(win, input_access);
+        }
+        else
+        {
+            AccessWindowHorizontal input_access(input, -border_size.left, vec_size_x);
+            window_changed = window_changed || update_window_and_padding(win, input_access);
+        }
+
+        AccessWindowHorizontal output_access(output, 0, vec_size_x);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
     }
     else
     {
-        AccessWindowHorizontal input_access(input, -border_size.left, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, input_access);
+        unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+        if (norm_info.is_cross_map())
+        {
+            vec_size_x = 1;
+        }
+        win = calculate_max_window(*input, Steps(vec_size_x));
     }
-
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    window_changed = window_changed || update_window_and_padding(win, output_access);
-    output_access.set_valid_region(win, input->valid_region());
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -101,6 +131,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 CLNormalizationLayerKernel::CLNormalizationLayerKernel()
     : _input(nullptr), _output(nullptr), _border_size(0), _is_norm_across_width(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 BorderSize CLNormalizationLayerKernel::border_size() const
@@ -113,24 +144,51 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
     configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
 }
 
-void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+                                           const ICLTensor        *input,
+                                           ICLTensor              *output,
+                                           NormalizationLayerInfo  norm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), *input->info()->clone());
+    auto padding_info = get_padding_info({input, output});
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
+    auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     _input  = input;
     _output = output;
 
-    const DataLayout   data_layout  = input->info()->data_layout();
-    const unsigned int norm_idx     = get_normalization_dimension_index(data_layout, norm_info);
-    _is_norm_across_width           = norm_idx == 0;
-    const unsigned int border_width = _is_norm_across_width ? num_elems_processed_per_iteration - 1 : 0;
-    _border_size                    = BorderSize(0, border_width);
+    const DataLayout data_layout = input->info()->data_layout();
+    unsigned int     vec_size_x =
+        adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+    int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+    if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
+    {
+        vec_size_x           = 1;
+        vec_size_x_leftovers = 0;
+    }
+
+    if (data_layout == DataLayout::NCHW)
+    {
+        const unsigned int norm_idx    = get_normalization_dimension_index(data_layout, norm_info);
+        _is_norm_across_width          = norm_idx == 0;
+        const unsigned int norm_radius = norm_info.norm_size() / 2;
+        // Border / padding calculation:
+        // For NCHW no border handling is impelmeneted in the kernel in the x axis.
+        // This means the x axis is fully-padded depending on vec_size_x and norm_size
+        // E.G. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is element 'p' is padding):
+        // In : |p|#|#|#|p|p|
+        // Out:   |#|#|#|p|
+        // The output has 1 right padding because of the vec_size_x.
+        // The input has 1 left padding because radius = 1.
+        // The input has 2 right padding because of radius = 1 AND the extra output padding
+        const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
+        const unsigned int border_width_right =
+            _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
+        _border_size = BorderSize(0, border_width_right, 0, border_width_left);
+    }
 
     const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
 
@@ -140,35 +198,29 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
     build_opts.add_option(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
     build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
     build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
-    build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)));
+    build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers)));
     build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
     build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
     build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
-    build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()),
+                             "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC,
+                             "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
 
     // Create kernel
     std::string kernel_name;
-    if(norm_info.is_in_map())
+    if (norm_info.is_in_map())
     {
         kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
     }
     else
     {
-        if(data_layout == DataLayout::NCHW)
-        {
-            kernel_name = "normalization_layer_cross_map";
-        }
-        else
-        {
-            // 1D Cross-Map normalization in NHWC is the same as 1D In-Map normalization in NCHW
-            kernel_name = "normalization_layer_in_map_nchw";
-        }
+        kernel_name = "normalization_layer_cross_map_" + lower_string(string_from_data_layout(data_layout));
     }
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
     // Set config_id for enabling LWS tuning
@@ -182,12 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
     _config_id += support::cpp11::to_string(input->info()->dimension(0));
     _config_id += "_";
     _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    if (data_layout == DataLayout::NHWC)
+    {
+        ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+    }
 }
 
-Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+Status CLNormalizationLayerKernel::validate(const ITensorInfo     *input,
+                                            const ITensorInfo     *output,
+                                            NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
 
     return Status{};
 }
@@ -207,6 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
+    } while (window_collapsed.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h
index 739a2ae9f1..5517ba6904 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -63,7 +63,10 @@ public:
      *                             Data layouts supported: same as @p input.
      * @param[in]  norm_info       Normalization layer information like the normalization type, normalization size and other parameters.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   NormalizationLayerInfo  norm_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
      *
      * @param[in] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -77,7 +80,7 @@ public:
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void       run(const Window &window, cl::CommandQueue &queue) override;
     BorderSize border_size() const override;
 
 private:
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 18cbe217be..59352a8fb7 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,32 +29,37 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
 
-    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+    const unsigned int channel_idx =
+        get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -64,11 +69,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *std)
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
 {
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, *input->clone());
-
     const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
 
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -77,16 +79,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
 
     bool window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->valid_region());
-
-    if(input->data_layout() == DataLayout::NHWC)
-    {
-        AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
-        AccessWindowHorizontal std_access(std, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, mean_access, std_access);
-    }
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -94,36 +89,57 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
     : _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input,
+                                                ICLTensor       *output,
+                                                const ICLTensor *mean,
+                                                const ICLTensor *std)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
 }
 
-void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context,
+                                                const ICLTensor        *input,
+                                                ICLTensor              *output,
+                                                const ICLTensor        *mean,
+                                                const ICLTensor        *std)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info()));
 
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+
+    auto padding_info = get_padding_info({input, output});
+
     _input  = input;
     _output = output;
     _mean   = mean;
     _std    = std;
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-    const unsigned int channel_idx                       = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
-    const DataType     dt                                = input->info()->data_type();
+    const DataLayout data_layout = input->info()->data_layout();
+
+    // Get number of elements to process per iterations
+    const unsigned int num_elems_processed_per_iteration =
+        (data_layout == DataLayout::NHWC)
+            ? adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0))
+            : (16 / input->info()->element_size());
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const DataType     dt          = input->info()->data_type();
 
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
     build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" +
+                           support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
     build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
 
     std::string kernel_name = "normalize_planar_yuv_layer_";
-    if(is_data_type_quantized(dt))
+    if (is_data_type_quantized(dt))
     {
         const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
         build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)));
@@ -132,13 +148,22 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
     }
 
     // Create kernel
-    kernel_name += lower_string(string_from_data_layout(input->info()->data_layout()));
+    kernel_name += lower_string(string_from_data_layout(data_layout));
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), std->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
+    if (data_layout == DataLayout::NHWC)
+    {
+        Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+        ICLKernel::configure_internal(win);
+        ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+    }
+    else
+    {
+        auto win_config = validate_and_configure_window_nchw(input->info(), output->info());
+        ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+        ICLKernel::configure_internal(win_config.second);
+    }
 
     // Set config_id for enabling LWS tuning
     _config_id = "normalize_planar_yuv_layer_";
@@ -153,11 +178,17 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
     _config_id += support::cpp11::to_string(input->info()->dimension(2));
 }
 
-Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input,
+                                                 const ITensorInfo *output,
+                                                 const ITensorInfo *mean,
+                                                 const ITensorInfo *std)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mean->clone().get(), std->clone().get()).first);
-
+    if (input->data_layout() == DataLayout::NCHW)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
+    }
     return Status{};
 }
 
@@ -182,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
index 6db4433e78..341b404e3d 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -67,7 +67,11 @@ public:
      * @param[in]  std             Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
      *                             Data types supported: same as @p input
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const ICLTensor        *mean,
+                   const ICLTensor        *std);
     /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
      *
      * @param[in]  input  Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -79,7 +83,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 2f54b390d5..0ac285038e 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,9 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -34,25 +36,29 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *output,
+                          const PaddingList &padding,
+                          PixelValue         constant_value,
+                          PaddingMode        mode)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_UNUSED(constant_value);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions()));
-    if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+    if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
 
         const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
-        for(size_t i = 0; i < padding.size(); ++i)
+        for (size_t i = 0; i < padding.size(); ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
             ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
         }
     }
 
-    if(output->total_size() > 0)
+    if (output->total_size() > 0)
     {
         TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
 
@@ -64,40 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 }
 } // namespace
 
-CLPadLayerKernel::CLPadLayerKernel()
-    : _input(nullptr), _output(nullptr), _4d_enabled(false)
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(
+    const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
 }
 
-void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(const CLCompileContext &compile_context,
+                                 const ICLTensor        *input,
+                                 ICLTensor              *output,
+                                 const PaddingList      &padding,
+                                 PixelValue              constant_value,
+                                 PaddingMode             mode)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_tensor_shape(
+                           misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode));
 
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input      = input;
     _output     = output;
     _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
 
     // Set build options
-    const DataType    &data_type               = input->info()->data_type();
-    const unsigned int input_width             = input->info()->dimension(0);
-    const unsigned int input_height            = input->info()->dimension(1);
-    const unsigned int input_depth             = input->info()->dimension(2);
-    const unsigned int pad_x_before            = padding.at(0).first;
-    const unsigned int pad_y_before            = padding.size() > 1 ? padding.at(1).first : 0;
-    const unsigned int pad_z_before            = padding.size() > 2 ? padding.at(2).first : 0;
-    const unsigned int vec_size                = adjust_vec_size(std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), input_width);
-    const unsigned int pad_right_start         = input_width + pad_x_before;
-    const unsigned int pad_x_before_remainder  = pad_x_before % vec_size;
-    const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
+    const DataType    &data_type    = input->info()->data_type();
+    const unsigned int input_width  = input->info()->dimension(0);
+    const unsigned int input_height = input->info()->dimension(1);
+    const unsigned int input_depth  = input->info()->dimension(2);
+    const unsigned int pad_x_before = padding.at(0).first;
+    const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+    const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+    const unsigned int vec_size     = adjust_vec_size(
+            std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))),
+            input_width);
+    const unsigned int pad_right_start        = input_width + pad_x_before;
+    const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+    const unsigned int vec_size_leftover_write =
+        vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -106,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
     build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
     build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
     build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write));
-    if(padding.size() > 1)
+    if (padding.size() > 1)
     {
         build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
         build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
 
-        if(padding.size() > 2)
+        if (padding.size() > 2)
         {
             build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
             build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
@@ -119,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
     }
 
     std::string kernel_name = "pad_layer_";
-    switch(mode)
+    switch (mode)
     {
         case PaddingMode::CONSTANT:
         {
             kernel_name += "constant";
 
-            const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
+            const unsigned int vec_size_leftover_read =
+                vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
 
             build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
             build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read));
 
-            if(pad_x_before >= vec_size)
+            if (pad_x_before >= vec_size)
             {
                 build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size));
-                build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size));
+                build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" +
+                                      support::cpp11::to_string(pad_right_start / vec_size));
             }
-            if(_4d_enabled)
+            if (_4d_enabled)
             {
                 build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
                 build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
@@ -152,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
 
             const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
             const unsigned int after_pad_fact_x      = (2 * input_width + pad_x_before) - is_reflect;
-            const unsigned int output_last_x         = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+            const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
 
             build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
             build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder));
-            build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
-            build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+            build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" +
+                                  support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+            build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" +
+                                  support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
             build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
-            build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
+            build_opts.add_option_if(after_pad_fact_x < output_last_x,
+                                     "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
 
             break;
         }
@@ -177,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayerKernel::validate(const ITensorInfo *input,
+                                  const ITensorInfo *output,
+                                  const PaddingList &padding,
+                                  PixelValue         constant_value,
+                                  PaddingMode        mode)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
     return Status{};
@@ -195,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
-        if(_4d_enabled)
+        if (_4d_enabled)
         {
             add_argument<unsigned int>(idx, batch++);
         }
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h
index 90af337f94..dca121b6a1 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.h
+++ b/src/core/CL/kernels/CLPadLayerKernel.h
@@ -56,7 +56,11 @@ public:
      * @param[in]  mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
      *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
      */
-    void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+    void configure(const ICLTensor   *input,
+                   ICLTensor         *output,
+                   const PaddingList &padding,
+                   PixelValue         constant_value = PixelValue(),
+                   PaddingMode        mode           = PaddingMode::CONSTANT);
     /** Set the input and output tensor.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -68,8 +72,12 @@ public:
      * @param[in]  mode            (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
      *                             or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
-                   PaddingMode mode = PaddingMode::CONSTANT);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const PaddingList      &padding,
+                   PixelValue              constant_value = PixelValue(),
+                   PaddingMode             mode           = PaddingMode::CONSTANT);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
      *
      * @param[in] input          Source tensor info. Data types supported: All.
@@ -80,7 +88,11 @@ public:
      * @param[in] mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
      *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const PaddingList &padding,
+                           PixelValue         constant_value = PixelValue(),
+                           PaddingMode        mode           = PaddingMode::CONSTANT);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
deleted file mode 100644
index c68c526ec9..0000000000
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
-                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(overflow_policy);
-    ARM_COMPUTE_UNUSED(rounding_policy);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1,
-                                                         1,
-                                                         DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2,
-                                                         1,
-                                                         DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
-    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
-    // Validate in case of configured output
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
-                                                             1,
-                                                             DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                             DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                             DataType::S32, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
-                                        "Output can only be U8 if both inputs are U8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
-                                        "Output can only be QASYMM8 if both inputs are QASYMM8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED),
-                                        "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
-                                        "Output can only be QSYMM16 if both inputs are QSYMM16");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
-                                        "Output can only be S32 if both inputs are QSYMM16");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
-    const TensorShape &out_shape    = broadcast_pair.first;
-    const ValidRegion &valid_region = broadcast_pair.second;
-
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(*output, out_shape);
-
-        if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
-        {
-            set_format_if_unknown(*output, Format::S16);
-        }
-        else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output, Format::F32);
-        }
-        else if(input1->data_type() == DataType::QASYMM8)
-        {
-            set_data_type_if_unknown(*output, DataType::QASYMM8);
-        }
-        else if(input1->data_type() == DataType::QASYMM8_SIGNED)
-        {
-            set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
-        }
-        else if(input1->data_type() == DataType::QSYMM16)
-        {
-            set_data_type_if_unknown(*output, DataType::QSYMM16);
-        }
-    }
-
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
-    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
-                                                ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
-}
-
-void CLPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
-                                                ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output,
-                                                  scale, overflow_policy, rounding_policy, act_info));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
-    int scale_int = -1;
-    // Extract sign, exponent and mantissa
-    int   exponent            = 0;
-    float normalized_mantissa = std::frexp(scale, &exponent);
-    // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
-    // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
-    // Moreover, it will be negative as we deal with 1/2^n
-    if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
-    {
-        // Store the positive exponent. We know that we compute 1/2^n
-        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
-        scale_int = std::abs(exponent - 1);
-    }
-
-    std::string acc_type;
-    // Check if it has float inputs and output
-    if(is_data_type_float(input1->data_type()) || is_data_type_float(input2->data_type()))
-    {
-        scale_int = -1;
-        acc_type  = (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) ? "float" : "half";
-    }
-    else
-    {
-        if(input1->element_size() == 2 || input2->element_size() == 2)
-        {
-            // Use 32-bit accumulator for 16-bit input
-            acc_type = "int";
-        }
-        else
-        {
-            // Use 16-bit accumulator for 8-bit input
-            acc_type = "ushort";
-        }
-    }
-
-    const bool is_quantized = is_data_type_quantized(input1->data_type());
-
-    // Set kernel build options
-    std::string    kernel_name = "pixelwise_mul";
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->data_type()));
-    build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->data_type()));
-    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->data_type()));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    if(is_quantized && (output->data_type() != DataType::S32))
-    {
-        const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform();
-        const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info  = output->quantization_info().uniform();
-
-        build_opts.add_option_if(is_data_type_quantized_asymmetric(input1->data_type()),
-                                 "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
-        build_opts.add_option_if(is_data_type_quantized_asymmetric(input2->data_type()),
-                                 "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
-        build_opts.add_option_if(is_data_type_quantized_asymmetric(output->data_type()),
-                                 "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset));
-        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
-        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
-        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
-        kernel_name += "_quantized";
-    }
-    else
-    {
-        kernel_name += (scale_int >= 0) ? "_int" : "_float";
-        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->data_type()), "-DWRAP", "-DSATURATE");
-        build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
-        build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
-        if(act_info.enabled())
-        {
-            build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
-            build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
-            build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-        }
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Set scale argument
-    unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters
-
-    if(scale_int >= 0 && !is_quantized)
-    {
-        _kernel.setArg(idx++, scale_int);
-    }
-    else
-    {
-        _kernel.setArg(idx++, scale);
-    }
-
-    ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
-                                                 ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-void CLPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
-    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
-    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
-    const TensorShape &out_shape = dst->info()->tensor_shape();
-
-    bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-    {
-        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
-        {
-            can_collapse = (in_shape1[d] == in_shape2[d]);
-        }
-    }
-
-    bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
-    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-    Window slice        = collapsed.first_slice_window_3D();
-    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, src_0, slice_input1);
-        add_3D_tensor_argument(idx, src_1, slice_input2);
-        add_3D_tensor_argument(idx, dst, slice);
-        enqueue(queue, *this, slice, lws_hint());
-
-        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
-        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPixelWiseMultiplicationKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize{ 0, border, 0, 0 };
-}
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration_complex = 1;
-
-Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-
-    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
-    // Validate in case of configured output
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
-    const TensorShape &out_shape    = broadcast_pair.first;
-    const ValidRegion &valid_region = broadcast_pair.second;
-
-    // Auto initialize output if not initialized
-    const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
-    auto_init_if_empty(*output, out_info);
-
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
-    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
-    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
-    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);
-
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLComplexPixelWiseMultiplicationKernel::CLComplexPixelWiseMultiplicationKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
-}
-
-void CLComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output, act_info));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window_complex(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_output->data_type()));
-    if(act_info.enabled())
-    {
-        build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
-        build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
-        build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options());
-
-    ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-void CLComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
-    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
-    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
-    const TensorShape &out_shape = dst->info()->tensor_shape();
-
-    bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-    {
-        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
-        {
-            can_collapse = (in_shape1[d] == in_shape2[d]);
-        }
-    }
-
-    bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
-    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-    Window slice        = collapsed.first_slice_window_3D();
-    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, src_0, slice_input1);
-        add_3D_tensor_argument(idx, src_1, slice_input2);
-        add_3D_tensor_argument(idx, dst, slice);
-        enqueue(queue, *this, slice, lws_hint());
-
-        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
-        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLComplexPixelWiseMultiplicationKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
-    return BorderSize{ 0, border, 0, 0 };
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
deleted file mode 100644
index 74102fd397..0000000000
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the pixelwise multiplication kernel. */
-class CLPixelWiseMultiplicationKernel : public ICLKernel
-{
-public:
-    /** Default constructor.*/
-    CLPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPixelWiseMultiplicationKernel(const CLPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPixelWiseMultiplicationKernel &operator=(const CLPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLPixelWiseMultiplicationKernel(CLPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * Valid configurations (Input1,Input2) -> Output :
-     *
-     *   - (U8,U8)                         -> U8
-     *   - (U8,U8)                         -> S16
-     *   - (U8,S16)                        -> S16
-     *   - (S16,U8)                        -> S16
-     *   - (S16,S16)                       -> S16
-     *   - (F16,F16)                       -> F16
-     *   - (F32,F32)                       -> F32
-     *   - (QASYMM8,QASYMM8)               -> QASYMM8
-     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     *   - (QSYMM16,QSYMM16)               -> QSYMM16
-     *   - (QSYMM16,QSYMM16)               -> S32
-     *
-     * @param[in]  input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[out] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  scale           Scale to apply after multiplication.
-     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
-     * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
-                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * Valid configurations (Input1,Input2) -> Output :
-     *
-     *   - (U8,U8)                         -> U8
-     *   - (U8,U8)                         -> S16
-     *   - (U8,S16)                        -> S16
-     *   - (S16,U8)                        -> S16
-     *   - (S16,S16)                       -> S16
-     *   - (F16,F16)                       -> F16
-     *   - (F32,F32)                       -> F32
-     *   - (QASYMM8,QASYMM8)               -> QASYMM8
-     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     *   - (QSYMM16,QSYMM16)               -> QSYMM16
-     *   - (QSYMM16,QSYMM16)               -> S32
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[out] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  scale           Scale to apply after multiplication.
-     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
-     * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
-                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
-     *
-     * Valid configurations (Input1,Input2) -> Output :
-     *
-     *   - (U8,U8)                         -> U8
-     *   - (U8,U8)                         -> S16
-     *   - (U8,S16)                        -> S16
-     *   - (S16,U8)                        -> S16
-     *   - (S16,S16)                       -> S16
-     *   - (F16,F16)                       -> F16
-     *   - (F32,F32)                       -> F32
-     *   - (QASYMM8,QASYMM8)               -> QASYMM8
-     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     *   - (QSYMM16,QSYMM16)               -> QSYMM16
-     *   - (QSYMM16,QSYMM16)               -> S32
-     *
-     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in] input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in] scale           Scale to apply after multiplication.
-     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
-     * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
-     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
-                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensorInfo *_input1;
-    const ITensorInfo *_input2;
-    ITensorInfo       *_output;
-};
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class CLComplexPixelWiseMultiplicationKernel : public ICLKernel
-{
-public:
-    /** Default constructor.*/
-    CLComplexPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLComplexPixelWiseMultiplicationKernel(const CLComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLComplexPixelWiseMultiplicationKernel &operator=(const CLComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLComplexPixelWiseMultiplicationKernel(CLComplexPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLComplexPixelWiseMultiplicationKernel &operator=(CLComplexPixelWiseMultiplicationKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input1   An input tensor info. Data types supported: F16/F32. Number of channels supported: 2.
-     * @param[in]  input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[out] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          An input tensor info. Data types supported: F32. Number of channels supported: 2.
-     * @param[in]  input2          An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[out] output          The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplicationKernel
-     *
-     * @param[in] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2.
-     * @param[in] input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensorInfo *_input1;
-    const ITensorInfo *_input2;
-    ITensorInfo       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 7b9caf0063..7dcdf1de6f 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,10 +30,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,10 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo       *input1,
+                          const ITensorInfo       *input2,
+                          const ITensorInfo       *output,
+                          const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
 
     // Check variances
     const int var_size = info.variances().size();
-    if(var_size > 1)
+    if (var_size > 1)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
-        for(int i = 0; i < var_size; ++i)
+        for (int i = 0; i < var_size; ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
         }
@@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
 
-    if(!info.max_sizes().empty())
+    if (!info.max_sizes().empty())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+                                        "Max and min sizes dimensions should match");
     }
 
-    for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+    for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+                                        "Max size should be greater than min size");
     }
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
     }
@@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo       *input1,
+                                                        const ITensorInfo       *input2,
+                                                        ITensorInfo             *output,
+                                                        const PriorBoxLayerInfo &info,
+                                                        int                      num_priors)
 {
     ARM_COMPUTE_UNUSED(input2);
     // Output tensor auto initialization if not yet initialized
@@ -88,10 +97,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
     auto_init_if_empty(*output, output_shape, 1, input1->data_type());
 
     const unsigned int     num_elems_processed_per_iteration = 4 * num_priors;
-    Window                 win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    Window                 win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, output_access);
-    Status                 err            = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status                 err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -99,15 +109,28 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
 CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
     : _input1(nullptr), _input2(nullptr), _output(nullptr), _info(), _num_priors(), _min(), _max(), _aspect_ratios()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const ICLTensor         *input1,
+                                      const ICLTensor         *input2,
+                                      ICLTensor               *output,
+                                      const PriorBoxLayerInfo &info,
+                                      cl::Buffer              *min,
+                                      cl::Buffer              *max,
+                                      cl::Buffer              *aspect_ratios)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios);
 }
 
-void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min,
-                                      cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const CLCompileContext  &compile_context,
+                                      const ICLTensor         *input1,
+                                      const ICLTensor         *input2,
+                                      ICLTensor               *output,
+                                      const PriorBoxLayerInfo &info,
+                                      cl::Buffer              *min,
+                                      cl::Buffer              *max,
+                                      cl::Buffer              *aspect_ratios)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
@@ -134,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
 
     int img_width  = info.img_size().x;
     int img_height = info.img_size().y;
-    if(img_width == 0 || img_height == 0)
+    if (img_width == 0 || img_height == 0)
     {
         img_width  = input2->info()->dimension(width_idx);
         img_height = input2->info()->dimension(height_idx);
@@ -142,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
 
     float step_x = info.steps()[0];
     float step_y = info.steps()[0];
-    if(step_x == 0.f || step_y == 0.f)
+    if (step_x == 0.f || step_y == 0.f)
     {
         step_x = static_cast<float>(img_width) / layer_width;
         step_y = static_cast<float>(img_height) / layer_height;
@@ -161,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
     build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
     build_opts.add_option_if(info.clip(), "-DIN_PLACE");
 
-    if(info.variances().size() > 1)
+    if (info.variances().size() > 1)
     {
-        for(unsigned int i = 0; i < info.variances().size(); ++i)
+        for (unsigned int i = 0; i < info.variances().size(); ++i)
         {
-            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+                                  support::cpp11::to_string(info.variances().at(i)));
         }
     }
     else
     {
-        for(unsigned int i = 0; i < 4; ++i)
+        for (unsigned int i = 0; i < 4; ++i)
         {
-            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+                                  support::cpp11::to_string(info.variances().at(0)));
         }
     }
 
@@ -193,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
     ICLKernel::configure_internal(win_config.second);
 }
 
-Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo       *input1,
+                                       const ITensorInfo       *input2,
+                                       const ITensorInfo       *output,
+                                       const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
     const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(),
+                                                              output->clone().get(), info, num_priors)
+                                    .first);
 
     return Status{};
 }
@@ -210,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
     queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
-    queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
-    if(!_info.max_sizes().empty())
+    queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float),
+                             _info.aspect_ratios().data());
+    if (!_info.max_sizes().empty())
     {
         queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
     }
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
index 6c369a7a4e..a50e0c5ff5 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -57,7 +57,13 @@ public:
      * @param[in]  max           Maximum prior box values
      * @param[in]  aspect_ratios Aspect ratio values
      */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
+    void configure(const ICLTensor         *input1,
+                   const ICLTensor         *input2,
+                   ICLTensor               *output,
+                   const PriorBoxLayerInfo &info,
+                   cl::Buffer              *min,
+                   cl::Buffer              *max,
+                   cl::Buffer              *aspect_ratios);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -69,8 +75,14 @@ public:
      * @param[in]  max             Maximum prior box values
      * @param[in]  aspect_ratios   Aspect ratio values
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
-                   cl::Buffer *aspect_ratios);
+    void configure(const CLCompileContext  &compile_context,
+                   const ICLTensor         *input1,
+                   const ICLTensor         *input2,
+                   ICLTensor               *output,
+                   const PriorBoxLayerInfo &info,
+                   cl::Buffer              *min,
+                   cl::Buffer              *max,
+                   cl::Buffer              *aspect_ratios);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
      *
      * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -80,14 +92,17 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+    static Status validate(const ITensorInfo       *input1,
+                           const ITensorInfo       *input2,
+                           const ITensorInfo       *output,
+                           const PriorBoxLayerInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
+    const ICLTensor  *_input1;
+    const ICLTensor  *_input2;
     ICLTensor        *_output;
     PriorBoxLayerInfo _info;
     int               _num_priors;
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index ccc61783c4..731fcb8e04 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,12 @@
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -47,15 +51,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
     /* If width is less then step, then make step same as width to avoid global size being step instead of actual width. */
     /* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */
-    const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration;
+    const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration)
+                                                           ? input->dimension(0)
+                                                           : temp_num_elems_processed_per_iteration;
 
     // This kernel doesn't need padding
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     return std::make_pair(Status{}, win);
 }
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *output,
+                          const ITensorInfo *weight,
+                          const ITensorInfo *bias)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output);
 
@@ -71,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -83,12 +91,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel()
     : _input(nullptr), _weight(nullptr), _bias(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context,
+                                                const ICLTensor        *input,
+                                                ICLTensor              *output,
+                                                const ICLTensor        *weight,
+                                                const ICLTensor        *bias)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
-    auto padding_info = get_padding_info({ input, weight, bias, output });
+    auto padding_info = get_padding_info({input, weight, bias, output});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info()));
 
@@ -102,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
     int32_t                       output_multiplier{};
     int32_t                       output_shift{};
     const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform();
-    const Status                  status    = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
+    const Status                  status =
+        quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
     output_shift *= -1;
 
     // Set build options
@@ -112,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
     build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
     build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-    build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
-    build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+    build_opts.add_option("-DMIN_BOUND=" +
+                          support::cpp11::to_string(std::get<0>(
+                              quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+    build_opts.add_option("-DMAX_BOUND=" +
+                          support::cpp11::to_string(std::get<1>(
+                              quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
 
     // Create kernel
     _kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options());
@@ -133,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input,
+                                                ICLTensor       *output,
+                                                const ICLTensor *weight,
+                                                const ICLTensor *bias)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias);
 }
 
-Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+                                                 const ITensorInfo *output,
+                                                 const ITensorInfo *weight,
+                                                 const ITensorInfo *bias)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
@@ -169,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue
         add_2D_tensor_argument(idx, _output, slice);
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
+    } while (window.slide_window_slice_2D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
index 31085c37ba..ba912e1d2d 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -63,7 +63,11 @@ public:
      * @param[in]  weight          Weight tensor. Data types supported: Same as @p input.
      * @param[in]  bias            Bias tensor. Data types supported: S32.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const ICLTensor        *weight,
+                   const ICLTensor        *bias);
     /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
      *
      * @param[in] input  Source tensor info with 2 dimensions. Data types supported: QSYMM16.
@@ -73,7 +77,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
deleted file mode 100644
index 76e703f0dd..0000000000
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLQuantizationLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
-    // Output must always be initialized
-    ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-    return Status{};
-}
-} // namespace
-
-CLQuantizationLayerKernel::CLQuantizationLayerKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLQuantizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    auto padding_info = get_padding_info({ input, output });
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
-
-    const int  vec_size_x     = 16 / input->info()->element_size();
-    const int  input_width_x  = input->info()->tensor_shape().x();
-    const bool multi_access_x = (input_width_x / vec_size_x > 0);
-
-    const UniformQuantizationInfo qinfo            = output->info()->quantization_info().uniform();
-    const DataType                output_data_type = output->info()->data_type();
-
-    float   scale_to_apply  = qinfo.scale;
-    int32_t offset_to_apply = qinfo.offset;
-    if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
-    {
-        /*
-         * In case of requantization of a quantized input tensor to an output tensor with another quantization
-         * instead of of apply dequantization and then a quantization functions, we just compute new scale and
-         * offset to apply.
-         *
-         * Assuming:
-         *   - q_i as input quantized value
-         *   - q_o as output quantized value
-         *   - z_i as input quantization offset value
-         *   - z_o as output quantization offset value
-         *   - s_i as input quantization scale value
-         *   - s_o as output quantization scale value
-         *   - z_n as new quantization offset value
-         *   - s_n as new quantization scale value
-         *
-         * q_o = ( q_i - z_i ) * s_i / s_o + z_o
-         *
-         * We can rewrite the formula as:
-         *
-         * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
-         *
-         * q_o = q_i / s_n + z_n
-         *
-         * Where:
-         *
-         * s_n = s_o / s_i
-         *
-         * z_n = - z_i * s_i / s_o + z_o
-         *
-         */
-        const UniformQuantizationInfo qinfo_in = _input->info()->quantization_info().uniform();
-        scale_to_apply /= qinfo_in.scale;
-        // In order to minimize flooring we convert the offset to a float,
-        // then compute the new offset in the float domain,
-        // finally we convert it back as int32_t
-        offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
-    }
-
-    // Create kernel
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(is_data_type_float(_input->info()->data_type()), "-DIS_FLOAT");
-    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
-    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
-    std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type);
-    build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
-    build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
-
-    _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-    if(multi_access_x)
-    {
-        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
-    }
-    ICLKernel::configure_internal(win);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    return Status{};
-}
-
-void CLQuantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
-    Window slice            = window_collapsed.first_slice_window_3D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.h b/src/core/CL/kernels/CLQuantizationLayerKernel.h
deleted file mode 100644
index e9d03decb3..0000000000
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors.
- */
-class CLQuantizationLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLQuantizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLQuantizationLayerKernel(const CLQuantizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLQuantizationLayerKernel &operator=(const CLQuantizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLQuantizationLayerKernel(CLQuantizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLQuantizationLayerKernel &operator=(CLQuantizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLQuantizationLayerKernel() = default;
-    /** Set the input, output.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this kernel
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input, output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] output          Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this kernel
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayerKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[in] output Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 87f4a5d7f3..c97910ef79 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,14 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -44,24 +43,29 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo         *input,
+                          const ITensorInfo         *rois,
+                          ITensorInfo               *output,
+                          const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
     ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
     ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
     ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+                                                           output->tensor_shape());
     }
 
-    if(is_data_type_quantized_asymmetric(input->data_type()))
+    if (is_data_type_quantized_asymmetric(input->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
 
@@ -81,14 +85,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITe
 CLROIAlignLayerKernel::CLROIAlignLayerKernel()
     : _input(nullptr), _output(nullptr), _rois(nullptr), _pool_info(0, 0, 0.f)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const ICLTensor           *input,
+                                      const ICLTensor           *rois,
+                                      ICLTensor                 *output,
+                                      const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const CLCompileContext    &compile_context,
+                                      const ICLTensor           *input,
+                                      const ICLTensor           *rois,
+                                      ICLTensor                 *output,
+                                      const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
@@ -98,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
     output->info()->set_data_layout(input->info()->data_layout());
 
-    auto padding_info = get_padding_info({ input, rois, output });
+    auto padding_info = get_padding_info({input, rois, output});
 
     _input     = input;
     _output    = output;
@@ -112,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
-    build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
-    build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
+    build_opts.add_option("-DMAX_DIM_X=" +
+                          support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+                              input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+    build_opts.add_option("-DMAX_DIM_Y=" +
+                          support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+                              input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+    build_opts.add_option("-DMAX_DIM_Z=" +
+                          support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+                              input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
     build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
     build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
     build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
     build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
-    build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+    build_opts.add_option_if(pool_info.sampling_ratio() > 0,
+                             "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
 
-    if(is_qasymm)
+    if (is_qasymm)
     {
         const UniformQuantizationInfo iq_info    = input->info()->quantization_info().uniform();
         const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform();
@@ -145,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayerKernel::validate(const ITensorInfo         *input,
+                                       const ITensorInfo         *rois,
+                                       ITensorInfo               *output,
+                                       const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
     return Status{};
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h
index cbf0e00165..2e84e5d303 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.h
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,7 @@
 #ifndef ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
 #define ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
 
-#include "arm_compute/core/CL/ICLArray.h"
 #include "src/core/CL/ICLKernel.h"
-
 namespace arm_compute
 {
 class ICLTensor;
@@ -63,7 +61,8 @@ public:
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void
+    configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -79,7 +78,11 @@ public:
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void configure(const CLCompileContext    &compile_context,
+                   const ICLTensor           *input,
+                   const ICLTensor           *rois,
+                   ICLTensor                 *output,
+                   const ROIPoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
      *
      * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -95,7 +98,10 @@ public:
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *rois,
+                           ITensorInfo               *output,
+                           const ROIPoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue);
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 43492a3d50..1b2c414a49 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,82 +25,82 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
+#include <cfloat>
 #include <cmath>
-#include <set>
 #include <string>
 
 namespace arm_compute
 {
-namespace
-{
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
+    : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto initialization if not yet initialized
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
-    auto_init_if_empty((*output), output_shape, 1, input->data_type());
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    Window                 win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    _type = CLKernelType::ELEMENTWISE;
 }
-} // namespace
 
-CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
-    : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
+Status CLROIPoolingLayerKernel::validate(const ITensorInfo         *input,
+                                         const ITensorInfo         *rois,
+                                         const ITensorInfo         *output,
+                                         const ROIPoolingLayerInfo &pool_info)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+
+    //Validate arguments
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+    ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
+    ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+
+    if (output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+                                    (output->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
+        ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
+    }
+
+    return Status{};
 }
 
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor           *input,
+                                        const ICLTensor           *rois,
+                                        ICLTensor                 *output,
+                                        const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const CLCompileContext    &compile_context,
+                                        const ICLTensor           *input,
+                                        const ICLTensor           *rois,
+                                        const ICLTensor           *output,
+                                        const ROIPoolingLayerInfo &pool_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
 
-    //Validate arguments
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
-    ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
-    ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
-    ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-
-    if(output->info()->total_size() != 0)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
-        ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
-    }
+    auto padding_info = get_padding_info({input, rois, output});
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    // Output auto initialization if not yet initialized
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+                             rois->info()->dimension(1));
+    auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(),
+                       output->info()->quantization_info());
 
     // Set instance variables
     _input     = input;
@@ -108,27 +108,46 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
     _output    = output;
     _pool_info = pool_info;
 
+    const DataType data_type = input->info()->data_type();
+    const bool     is_qasymm = is_data_type_quantized_asymmetric(data_type);
+
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX))));
-    build_opts.emplace(("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY))));
-    build_opts.emplace(("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ))));
-    build_opts.emplace(("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())));
-    build_opts.emplace(("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())));
-    build_opts.emplace(("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())));
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(data_type));
+    build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
+    build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
+    build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+    build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
+    build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
+    build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale()));
+
+    if (is_qasymm)
+    {
+        // Determine quantization info scale, offset
+        UniformQuantizationInfo uqinfo = UniformQuantizationInfo();
+        uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(),
+                                                     _output->info()->quantization_info().uniform());
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale));
+
+        // Specify minimum possible value of datatype
+        build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(0));
+    }
+    else
+    {
+        // Specify min value of F32 datatype
+        build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(-FLT_MAX));
+    }
+
+    Window win = calculate_max_window(*(output->info()), Steps());
+    ICLKernel::configure_internal(win);
 
     // Create kernel
     std::string kernel_name = "roi_pooling_layer";
-    _kernel                 = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_3D_tensor() + num_arguments_per_1D_array();
-    add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
-    add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
 
-    ICLKernel::configure_internal(win_config.second);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
index 35f42a9676..80bfb63092 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,9 +25,6 @@
 #define ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
 
 #include "src/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/CL/ICLArray.h"
-
 namespace arm_compute
 {
 class ICLTensor;
@@ -62,11 +59,12 @@ public:
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void
+    configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: F16/F32.
+     * @param[in]  input           Source tensor. Data types supported: F16/F32/QASYMM8
      * @param[in]  rois            ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
      *                             as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
      * @param[out] output          Destination tensor. Data types supported: Same as @p input.
@@ -77,15 +75,37 @@ public:
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void configure(const CLCompileContext    &compile_context,
+                   const ICLTensor           *input,
+                   const ICLTensor           *rois,
+                   const ICLTensor           *output,
+                   const ROIPoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 
+    /** Static Validate function to check inputs will lead to valid configuration of @ref CLROIPoolingLayer
+     *
+     * @param[in]  input     Source tensor. Data types supported: F16/F32/QASYMM8
+     * @param[in]  rois      ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+     *                       as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     */
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *rois,
+                           const ITensorInfo         *output,
+                           const ROIPoolingLayerInfo &pool_info);
+
 private:
     const ICLTensor    *_input;
     const ICLTensor    *_rois;
-    ICLTensor          *_output;
+    const ICLTensor    *_output;
     ROIPoolingLayerInfo _pool_info;
 };
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index 85f79988c9..622f6210b9 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,9 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -40,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16;
 Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
-                                                         1,
-                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
-                                                         DataType::U16, DataType::S16,
-                                                         DataType::U32, DataType::S32,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
 
@@ -54,20 +54,24 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()),
+                                    "start value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()),
+                                    "end value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()),
+                                    "step value is outside the range of the data type");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+                                    "Output tensor size is incorrect");
 
     return Status{};
 }
 } // namespace
 
-CLRangeKernel::CLRangeKernel()
-    : _start(0), _end(1), _step(1), _output(nullptr)
+CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLRangeKernel::configure(ICLTensor *output, const float start, const float end, const float step)
@@ -75,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float
     configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
 }
 
-void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRangeKernel::configure(
+    const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step));
 
     // Configure kernel window
-    unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
-    Window       win                               = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
 
-    auto padding_info = get_padding_info({ output });
+    auto padding_info = get_padding_info({output});
 
     _start  = start;
     _end    = end;
@@ -97,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
     build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
     build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
-    if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+    if (is_data_type_quantized_asymmetric(output->info()->data_type()))
     {
         const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
         build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset));
@@ -135,4 +142,4 @@ void CLRangeKernel::run(const Window &window, cl::CommandQueue &queue)
 
     enqueue(queue, *this, window, lws_hint());
 }
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h
index 1b94a099ed..65251a11e5 100644
--- a/src/core/CL/kernels/CLRangeKernel.h
+++ b/src/core/CL/kernels/CLRangeKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLRANGEKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 2697a0df98..c8665f8fbd 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,42 +28,47 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 namespace
 {
-// OpenCL kernel requires input width to be a power of 2 for x-axis.
-constexpr unsigned int border_val = 64;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    if(input->num_channels() == 1)
+    if (input->num_channels() == 1)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                             DataType::S32, DataType::F16, DataType::F32);
     }
     else
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON(axis == 0);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8,
+                                    "Not supported reduction operation for QASYMM8");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-    ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (width == 0) && (input->data_type() != DataType::QASYMM8) && (input->data_type() != DataType::QASYMM8_SIGNED));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");
+    ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) &&
+                                (input->data_type() != DataType::QASYMM8) &&
+                                (input->data_type() != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN),
+                                    "Not supported reduction operation, use CLArgMinMaxLayer");
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -71,85 +76,50 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
 
     return Status{};
 }
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
-{
-    // Output tensor auto initialization if not yet initialized
-    const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, true);
-    DataType          output_data_type = input->data_type();
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
-
-    const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
-    Window             win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    bool               window_changed                    = false;
-    const bool         is_serial_op                      = needs_serialized_reduction(op, input->data_type(), axis);
-
-    switch(axis)
-    {
-        case 0:
-        {
-            if(!is_serial_op)
-            {
-                const unsigned int     border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
-                AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
-                AccessWindowHorizontal output_access(output, 0, 1);
-                window_changed = update_window_and_padding(win, input_access, output_access);
-                output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-            }
-        }
-        break;
-        case 1:
-        case 2:
-        case 3:
-        {
-            AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, input_access, output_access);
-            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-        }
-        break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-
-    return std::make_tuple(err, win);
-}
 } // namespace
 
 CLReductionOperationKernel::CLReductionOperationKernel()
-    : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
+    : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-BorderSize CLReductionOperationKernel::border_size() const
+void CLReductionOperationKernel::configure(const ICLTensor   *input,
+                                           ICLTensor         *output,
+                                           unsigned int       axis,
+                                           ReductionOperation op)
 {
-    return _border_size;
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
 }
 
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, width);
-}
-
-void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
+void CLReductionOperationKernel::configure(const CLCompileContext &compile_context,
+                                           const ICLTensor        *input,
+                                           ICLTensor              *output,
+                                           unsigned int            axis,
+                                           ReductionOperation      op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op, width));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+    auto padding_info = get_padding_info({input, output});
 
     _input          = input;
     _output         = output;
     _reduction_axis = axis;
     _op             = op;
 
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
+
     // Set build options
     CLBuildOptions build_opts;
     DataType       data_type = input->info()->data_type();
     std::string    data_type_promoted{};
 
-    if(is_data_type_quantized(data_type))
+    if (is_data_type_quantized(data_type))
     {
         data_type_promoted = "int";
     }
@@ -158,8 +128,15 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
         data_type_promoted = get_cl_type_from_data_type(data_type);
     }
 
+    const unsigned int width             = input->info()->dimension(0) * input->info()->num_channels();
+    unsigned int       vec_size          = (is_data_type_quantized(input->info()->data_type()) && (axis == 0)) ? 1 : 16;
+    vec_size                             = adjust_vec_size(vec_size, width);
+    const unsigned int vec_size_leftover = width % vec_size;
+
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
     build_opts.add_option_if(is_data_type_float(data_type), "-DFLOAT_DATA_TYPE");
     build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
     build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
@@ -167,11 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
     build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
     build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
     build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
-    build_opts.add_option_if(input->info()->num_channels() == 2, "-DCOMPLEX");
-    build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
-    build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
-
-    switch(op)
+    build_opts.add_option_if(is_data_type_quantized(data_type),
+                             "-DOFFSET=" +
+                                 support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
+    build_opts.add_option_if(
+        is_data_type_quantized(data_type),
+        "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
+
+    switch (op)
     {
         case ReductionOperation::SUM_SQUARE:
             build_opts.add_option(("-DOPERATION=square_sum"));
@@ -181,7 +161,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
             build_opts.add_option(("-DOPERATION=sum"));
             break;
         case ReductionOperation::MIN:
+            build_opts.add_option(("-DOPERATION=min_"));
+            break;
         case ReductionOperation::MAX:
+            build_opts.add_option(("-DOPERATION=max_"));
             break;
         case ReductionOperation::PROD:
             build_opts.add_option(("-DOPERATION=product"));
@@ -191,30 +174,15 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
     }
 
     // Create kernel
-    cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
     std::string kernel_axis_name;
     const bool  is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
 
-    switch(axis)
+    switch (axis)
     {
         case 0:
         {
-            if(is_serial_op)
-            {
-                build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-                build_opts.add_option_if_else(_input->info()->data_type() == DataType::F16, "-DCOND_DATA_TYPE=short", "-DCOND_DATA_TYPE=int");
-                kernel_axis_name = "non_parallel_x";
-            }
-            else
-            {
-                build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
-                const unsigned int width_leftover = input->info()->dimension(0) % border_val;
-                const unsigned int border_width   = (width_leftover != 0) ? border_val - width_leftover : 0;
-                kernel_axis_name                  = "x";
-
-                lws_hint     = create_lws_hint_parallel_implementations(input->info()->dimension(0), border_val);
-                _border_size = BorderSize(0, border_width, 0, 0);
-            }
+            build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(width));
+            kernel_axis_name = ((is_serial_op) ? "non_parallel_x" : "x");
         }
         break;
         case 1:
@@ -236,18 +204,21 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
     _kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
+    TensorShape actual_input_shape = input->info()->tensor_shape();
+    actual_input_shape[0]          = width;
 
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+    Window win = calculate_max_window(actual_input_shape, Steps(vec_size));
+    ICLKernel::configure_internal(win);
 
-    ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input,
+                                            const ITensorInfo *output,
+                                            unsigned int       axis,
+                                            ReductionOperation op)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
-
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
     return Status{};
 }
 
@@ -257,18 +228,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
     const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
-    switch(_reduction_axis)
+    switch (_reduction_axis)
     {
         case 0:
         {
             // We use parallel reduction only in non quantized types
-            if(is_serial_op)
+            if (is_serial_op)
             {
                 // Get first input and output slices
-                Window window_in{ window };
-                window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+                Window window_in{window};
+                window_in.set(Window::DimX,
+                              Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
 
-                Window out_window{ window };
+                Window out_window{window};
                 out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
 
                 Window in_slice  = window_in.first_slice_window_1D();
@@ -279,91 +251,114 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
                     unsigned int idx = 0;
                     add_1D_tensor_argument(idx, _input, in_slice);
                     add_1D_tensor_argument(idx, _output, out_slice);
-                    enqueue(queue, *this, in_slice, lws_hint());
-                }
-                while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+                    enqueue(queue, *this, in_slice);
+                } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
             }
             else
             {
                 // Set out window
-                Window out_window(window);
-                out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+                bool   has_collapsed = true;
+                Window window_in     = window.collapse_if_possible(window, 2, &has_collapsed);
+                ARM_COMPUTE_ERROR_ON(!has_collapsed);
 
-                // Get first input and output slices
-                Window in_slice  = window.first_slice_window_2D();
-                Window out_slice = out_window.first_slice_window_2D();
-
-                // Reshape window
-                const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
-                in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
-
-                // Set local sums buffer
-                unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
-                _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
+                Window window_out = window_in;
+                window_out.set(0, Window::Dimension());
 
-                do
-                {
-                    unsigned int idx = 0;
-                    add_2D_tensor_argument(idx, _input, in_slice);
-                    add_2D_tensor_argument(idx, _output, out_slice);
-                    enqueue(queue, *this, in_slice, lws_hint());
-                }
-                while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, window_in);
+                add_3D_tensor_argument(idx, _output, window_out);
+                enqueue(queue, *this, window_in);
             }
         }
         break;
         case 1:
         {
-            // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
-            Window in_slice  = window_in.first_slice_window_2D();
-            Window out_slice = window.first_slice_window_2D();
+            bool   has_collapsed = true;
+            Window actual_window = window.collapse_if_possible(window, 2, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
 
-            do
-            {
-                unsigned int idx = 0;
-                add_2D_tensor_argument(idx, _input, in_slice);
-                add_2D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            actual_window = actual_window.shift_dimensions(1, Window::DimY);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+            _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+            _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+            _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
         }
         break;
         case 2:
         {
-            // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
-            Window in_slice  = window_in.first_slice_window_3D();
-            Window out_slice = window.first_slice_window_3D();
+            bool   has_collapsed = true;
+            Window actual_window = window.collapse_if_possible(window, 3, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
 
-            do
-            {
-                unsigned int idx = 0;
-                add_3D_tensor_argument(idx, _input, in_slice);
-                add_3D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+            actual_window = actual_window.shift_dimensions(1, Window::DimZ);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+            _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+            _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+            _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+            _kernel.setArg<cl_uint>(idx++, output_strides[3]);
+            _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
         }
         break;
         case 3:
         {
-            // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(3, Window::Dimension(0, 1, 1));
-            Window in_slice  = window_in.first_slice_window_4D();
-            Window out_slice = window.first_slice_window_4D();
+            bool   has_collapsed = true;
+            Window actual_window = window.shift_dimensions(1, Window::DimW);
 
-            do
-            {
-                unsigned int idx = 0;
-                add_4D_tensor_argument(idx, _input, in_slice);
-                add_4D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+            actual_window = actual_window.collapse_if_possible(actual_window, 2, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+            _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+            _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+            _kernel.setArg<cl_uint>(idx++, input_strides[4]);
+            _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+            _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+            _kernel.setArg<cl_uint>(idx++, output_strides[4]);
+            _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
         }
         break;
         default:
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h
index ff9fd61484..2f94b2add3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.h
+++ b/src/core/CL/kernels/CLReductionOperationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -56,9 +57,8 @@ public:
      *                    Output will have the same number of dimensions as input.
      * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
      * @param[in]  op     Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
-     * @param[in]  width  (Optional)  In case of x-axis we also need to provide the width of the input image.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+    void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -67,9 +67,12 @@ public:
      *                             Output will have the same number of dimensions as input.
      * @param[in]  axis            Axis along which to reduce. Supported reduction axis : 0,1,2,3
      * @param[in]  op              Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
-     * @param[in]  width           (Optional)  In case of x-axis we also need to provide the width of the input image.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   unsigned int            axis,
+                   ReductionOperation      op);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
      *
@@ -78,22 +81,20 @@ public:
      *                   Output will have the same number of dimensions as input.
      * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
      * @param[in] op     Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
-     * @param[in] width  (Optional)  In case of x-axis we also need to provide the width of the input image.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
 
 private:
     const ICLTensor   *_input;
     ICLTensor         *_output;
     unsigned int       _reduction_axis;
     ReductionOperation _op;
-    BorderSize         _border_size;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
deleted file mode 100644
index 0ebeefcc74..0000000000
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLRemapKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-
-CLRemapKernel::CLRemapKernel()
-    : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
-{
-}
-
-BorderSize CLRemapKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_undefined);
-}
-
-void CLRemapKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
-                              bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
-    ARM_COMPUTE_UNUSED(border_undefined);
-
-    _input  = input;
-    _output = output;
-    _map_x  = map_x;
-    _map_y  = map_y;
-
-    // Create kernel
-    std::set<std::string> build_opts         = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
-    std::string           interpolation_name = string_from_interpolation_policy(policy);
-    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
-    std::string kernel_name = "remap_" + interpolation_name;
-    _kernel                 = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-    const int total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
-    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
-
-    Window             win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
-
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set static arguments
-    unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_float>(idx++, input->info()->dimension(0));
-    _kernel.setArg<cl_float>(idx++, input->info()->dimension(1));
-}
-
-void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        add_2D_tensor_argument(idx, _map_x, slice);
-        add_2D_tensor_argument(idx, _map_y, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLRemapKernel.h b/src/core/CL/kernels/CLRemapKernel.h
deleted file mode 100644
index 8efcf091ed..0000000000
--- a/src/core/CL/kernels/CLRemapKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAPKERNEL_H
-#define ARM_COMPUTE_CLREMAPKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a remap on a tensor */
-class CLRemapKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLRemapKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLRemapKernel(const CLRemapKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLRemapKernel &operator=(const CLRemapKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLRemapKernel(CLRemapKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLRemapKernel &operator=(CLRemapKernel &&) = default;
-    /** Initialize the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[in]  map_x            Map for X coordinates. Data types supported: F32.
-     * @param[in]  map_y            Map for Y coordinates. Data types supported: F32.
-     * @param[out] output           Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
-     * @param[in]  policy           The interpolation type.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
-    /** Initialize the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[in]  map_x            Map for X coordinates. Data types supported: F32.
-     * @param[in]  map_y            Map for Y coordinates. Data types supported: F32.
-     * @param[out] output           Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
-     * @param[in]  policy           The interpolation type.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    const ICLTensor *_map_x;
-    const ICLTensor *_map_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREMAPKERNEL_H */
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index 01853450ee..9fd21943e8 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,10 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -50,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
     const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
     ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+                                    "The width of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+                                    "The height of the input tensor must be a multiple of stride");
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+        const TensorInfo tensor_info_output =
+            output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -65,9 +70,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 }
 } // namespace
 
-CLReorgLayerKernel::CLReorgLayerKernel()
-    : _input(nullptr), _output(nullptr)
+CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t stride)
@@ -75,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in
     configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
 }
 
-void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayerKernel::configure(const CLCompileContext &compile_context,
+                                   const ICLTensor        *input,
+                                   ICLTensor              *output,
+                                   int32_t                 stride)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input  = input;
     _output = output;
 
-    std::string  kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
-    const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+    std::string kernel_name =
+        std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+    const size_t idx_channel =
+        get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
     // Create kernel
     CLBuildOptions build_opts;
@@ -96,12 +106,13 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
 
     // Configure window
     // auto inizialize the output tensor if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_tensor_shape(
+                           misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
 
     Window win = calculate_max_window(*output->info(), Steps());
 
     // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
     ICLKernel::configure_internal(win);
 
     _config_id = kernel_name;
@@ -118,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input,
+                                    const arm_compute::ITensorInfo *output,
+                                    int32_t                         stride)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
 
@@ -138,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue)
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h
index 455a6170c6..f335071e9f 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.h
+++ b/src/core/CL/kernels/CLReorgLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLREORGLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index b3c9bcafd1..00241b161b 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,9 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -37,17 +40,21 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
 {
+    ARM_COMPUTE_UNUSED(use_inverted_axis);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+                                    "Current implementation only supports up to 4 dimensions.");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -58,20 +65,27 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 }
 } // namespace
 
-CLReverseKernel::CLReverseKernel()
-    : _input(nullptr), _output(nullptr), _axis(nullptr)
+CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const ICLTensor *input,
+                                ICLTensor       *output,
+                                const ICLTensor *axis,
+                                bool             use_inverted_axis)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis);
 }
 
-void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const CLCompileContext &compile_context,
+                                const ICLTensor        *input,
+                                ICLTensor              *output,
+                                const ICLTensor        *axis,
+                                bool                    use_inverted_axis)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
-    auto padding_info = get_padding_info({ input, output, axis });
+    auto padding_info = get_padding_info({input, output, axis});
 
     _input  = input;
     _output = output;
@@ -80,12 +94,14 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), *input->info()->clone());
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info(), use_inverted_axis));
 
     // Set kernel build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0)));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+    build_opts.add_option("-DRANK=" + support::cpp11::to_string(input->info()->num_dimensions()));
+    build_opts.add_option_if(use_inverted_axis, "-DUSE_INVERTED_AXIS");
 
     // Create kernel
     _kernel = create_kernel(compile_context, "reverse", build_opts.options());
@@ -113,9 +129,12 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status CLReverseKernel::validate(const ITensorInfo *input,
+                                 const ITensorInfo *output,
+                                 const ITensorInfo *axis,
+                                 bool               use_inverted_axis)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
     return Status{};
 }
 
@@ -135,7 +154,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue)
         add_1D_tensor_argument(idx, _axis, axis_slice);
         add_4D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_4D(slice));
+    } while (collapsed.slide_window_slice_4D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h
index 4a21e4f802..a630aec15a 100644
--- a/src/core/CL/kernels/CLReverseKernel.h
+++ b/src/core/CL/kernels/CLReverseKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CLREVERSEKERNEL_H
-#define ARM_COMPUTE_CLREVERSEKERNEL_H
+#ifndef ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
+#define ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
 
 #include "src/core/CL/ICLKernel.h"
 
@@ -48,29 +48,43 @@ public:
     ~CLReverseKernel() = default;
     /** Initialise the kernel's inputis and output
      *
-     * @param[in]  input  Input tensor. Data types supported: All.
-     * @param[out] output Output tensor. Data type supported: Same as @p input
-     * @param[in]  axis   Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+     * @param[in]  input             Input tensor. Data types supported: All.
+     * @param[out] output            Output tensor. Data type supported: Same as @p input
+     * @param[in]  axis              Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1
+     *
+     * @note The value of each axis should be between [-rank, rank)
+     * @note If there are duplicate values in the tensor, the subsequent axis values are ignored. e.g. an array of [2, 2] has the same effects as [2].
+     *
+     * @deprecated Support for U32 in axis tensor will be removed in 24.02 release
+     *
      */
-    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis);
     /** Initialise the kernel's inputis and output
      *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All.
-     * @param[out] output          Output tensor. Data type supported: Same as @p input
-     * @param[in]  axis            Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+     * @param[in]  compile_context   The compile context to be used.
+     * @param[in]  input             Input tensor. Data types supported: All.
+     * @param[out] output            Output tensor. Data type supported: Same as @p input
+     * @param[in]  axis              Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const ICLTensor        *axis,
+                   bool                    use_inverted_axis);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
      *
-     * @param[in] input  Input tensor info. Data types supported: All.
-     * @param[in] output Output tensor info. Data type supported: Same as @p input
-     * @param[in] axis   Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+     * @param[in] input             Input tensor info. Data types supported: All.
+     * @param[in] output            Output tensor info. Data type supported: Same as @p input
+     * @param[in] axis              Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -81,4 +95,4 @@ public:
     const ICLTensor *_axis;
 };
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREVERSEKERNEL_H */
+#endif // ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
deleted file mode 100644
index c2c78c8f6d..0000000000
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLScaleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include "src/core/utils/ScaleUtils.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-inline std::pair<float, float> calculate_scale_factors(const ITensorInfo &input, const ITensorInfo &output, const ScaleKernelInfo &info)
-{
-    const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input.data_layout() : info.data_layout;
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    // Compute the ratio between source width/height and destination width/height
-    const unsigned int input_width   = input.dimension(idx_width);
-    const unsigned int input_height  = input.dimension(idx_height);
-    const unsigned int output_width  = output.dimension(idx_width);
-    const unsigned int output_height = output.dimension(idx_height);
-
-    float wr = arm_compute::scale_utils::calculate_resize_ratio(input_width, output_width, info.align_corners);
-    float hr = arm_compute::scale_utils::calculate_resize_ratio(input_height, output_height, info.align_corners);
-
-    return std::make_pair(wr, hr);
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(output == input);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
-
-    float wr = 0.f;
-    float hr = 0.f;
-    std::tie(wr, hr) = calculate_scale_factors(*input, *output, info);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f));
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const ScaleKernelInfo &info, BorderSize &border)
-{
-    Window           win{};
-    bool             window_changed{};
-    unsigned int     num_elems_processed_per_iteration = 0;
-    const DataLayout data_layout                       = info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : info.data_layout;
-
-    switch(data_layout)
-    {
-        case DataLayout::NCHW:
-        {
-            if(info.border_mode == BorderMode::UNDEFINED)
-            {
-                border = BorderSize(0);
-            }
-
-            num_elems_processed_per_iteration = 4;
-            // Configure kernel window
-            win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowStatic input_access(input,
-                                            -border.left, -border.top,
-                                            input->dimension(0) + border.right,
-                                            input->dimension(1) + border.bottom);
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-            output_access.set_valid_region(win, calculate_valid_region_scale(*(input),
-                                                                             output->tensor_shape(),
-                                                                             info.interpolation_policy,
-                                                                             info.sampling_policy,
-                                                                             info.border_mode == BorderMode::UNDEFINED));
-
-            window_changed = update_window_and_padding(win, input_access, output_access);
-        }
-        break;
-        case DataLayout::NHWC:
-        {
-            // Configure kernel window
-            win = calculate_max_window(*output, Steps());
-        }
-        break;
-        default:
-            ARM_COMPUTE_ERROR("Data layout not supported");
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-BorderSize CLScaleKernel::border_size() const
-{
-    return BorderSize(static_cast<size_t>(_data_layout == DataLayout::NCHW));
-}
-
-Status CLScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
-    const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : info.data_layout;
-    BorderSize       border      = BorderSize(static_cast<size_t>(data_layout == DataLayout::NCHW));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info, border).first);
-
-    return Status{};
-}
-
-const ICLTensor *CLScaleKernel::input() const
-{
-    return _input;
-}
-
-const ICLTensor *CLScaleKernel::output() const
-{
-    return _output;
-}
-
-void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
-    auto padding_info = get_padding_info({ input, output });
-
-    _input                = input;
-    _output               = output;
-    _interpolation_policy = info.interpolation_policy;
-    _data_layout          = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
-    _align_corners        = info.align_corners;
-
-    float wr = 0.f;
-    float hr = 0.f;
-    std::tie(wr, hr) = calculate_scale_factors(*input->info(), *output->info(), info);
-
-    const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && _interpolation_policy == InterpolationPolicy::BILINEAR;
-
-    const int  idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int  idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const bool is_nhwc    = _data_layout == DataLayout::NHWC;
-
-    // Compute actual border size
-    BorderSize border = border_size();
-
-    auto interpolation_policy_to_use = _interpolation_policy;
-    // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    if(_interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
-    {
-        interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
-    }
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), info, border);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Create kernel
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, input->info()->data_type()));
-    build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
-    build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
-    build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
-    build_opts.add_option_if(_align_corners, "-DALIGN_CORNERS");
-    if(call_quantized_kernel)
-    {
-        const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
-        build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
-        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
-    }
-    std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
-    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
-    std::string kernel_name = "scale_" + interpolation_name;
-    kernel_name += call_quantized_kernel ? "_quantized_" : "_";
-    kernel_name += lower_string(string_from_data_layout(_data_layout));
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-
-    const unsigned int input_width  = input->info()->dimension(idx_width);
-    const unsigned int input_height = input->info()->dimension(idx_height);
-
-    _kernel.setArg<float>(idx++, input_width);
-    _kernel.setArg<float>(idx++, input_height);
-    _kernel.setArg<float>(idx++, wr);
-    _kernel.setArg<float>(idx++, hr);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = "scale_";
-    _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
-    _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
-    _config_id += (is_nhwc ? "nhwc" : "nchw");
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(3));
-    if(is_nhwc)
-    {
-        ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-    }
-}
-
-void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-    switch(_data_layout)
-    {
-        case DataLayout::NCHW:
-        {
-            Window slice = window.first_slice_window_2D();
-
-            do
-            {
-                unsigned int idx = 0;
-                add_2D_tensor_argument(idx, _input, slice);
-                add_2D_tensor_argument(idx, _output, slice);
-                enqueue(queue, *this, slice, lws_hint());
-            }
-            while(window.slide_window_slice_2D(slice));
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
-            Window slice     = collapsed.first_slice_window_4D();
-
-            unsigned int idx = 0;
-            add_4D_tensor_argument(idx, _input, slice);
-            add_4D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice, lws_hint());
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Data layout not supported");
-    }
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLScaleKernel.h b/src/core/CL/kernels/CLScaleKernel.h
deleted file mode 100644
index a72e3938d9..0000000000
--- a/src/core/CL/kernels/CLScaleKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCALEKERNEL_H
-#define ARM_COMPUTE_CLSCALEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the scale kernel */
-class CLScaleKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's inputs, output and interpolation policy
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
-     * @param[out] output Destination tensor. Data types supported: Same as @p input
-     *                    All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  info   @ref ScaleKernelInfo Kernel descriptor to be used to configure.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
-    /** Initialise the kernel's inputs, output and interpolation policy
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
-     * @param[out] output          Destination tensor. Data types supported: Same as @p input
-     *                             All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  info            @ref ScaleKernelInfo Kernel descriptor to be used to configure.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLScaleKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
-     * @param[in] output Destination tensor info. Data types supported: Same as @p input
-     *                   All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] info   @ref ScaleKernelInfo Kernel descriptor to be used to validate
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
-    /** Input tensor accessor.
-     *
-     * @return Pointer to input tensor.
-     */
-    const ICLTensor *input() const;
-    /** Output tensor accessor.
-     *
-     * @return Pointer to output tensor.
-     */
-    const ICLTensor *output() const;
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-    // Getter for interpolation policy
-    InterpolationPolicy get_interpolation_policy() const
-    {
-        return _interpolation_policy;
-    }
-
-private:
-    InterpolationPolicy _interpolation_policy = InterpolationPolicy::BILINEAR;
-    DataLayout          _data_layout          = DataLayout::UNKNOWN;
-    bool                _align_corners        = false;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCALEKERNEL_H */
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index f8e63ddc43..703c64d8d3 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,10 +29,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -50,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
 
     const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
-    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+                                ((c->tensor_shape().num_dimensions() > 1) ||
+                                 (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -62,12 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
 }
 } // namespace
 
-CLSelectKernel::CLSelectKernel()
-    : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelectKernel::configure(const CLCompileContext &compile_context,
+                               const ICLTensor        *c,
+                               const ICLTensor        *x,
+                               const ICLTensor        *y,
+                               ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
@@ -78,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
     _output        = output;
     _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
 
-    auto               padding_info         = get_padding_info({ c, x, y, output });
+    auto               padding_info         = get_padding_info({c, x, y, output});
     const unsigned int vec_size_x           = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0));
     const int          vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x;
 
@@ -90,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
 
     // Create kernel
     std::string kernel_name = "select";
-    if(_has_same_rank)
+    if (_has_same_rank)
     {
         kernel_name += "_same_rank";
     }
     else
     {
         const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
-        if(is_input_rank_greater_than_two)
+        if (is_input_rank_greater_than_two)
         {
             const size_t width      = x->info()->tensor_shape().x();
             const size_t height     = x->info()->tensor_shape().y();
@@ -126,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
     return Status{};
@@ -140,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
-    if(!_has_same_rank)
+    if (!_has_same_rank)
     {
         Window vector_slice = window.first_slice_window_1D();
         vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -151,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
     do
     {
         unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
-        if(_has_same_rank)
+        if (_has_same_rank)
         {
             add_3D_tensor_argument(idx, _c, slice);
         }
@@ -160,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
         add_3D_tensor_argument(idx, _output, slice);
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h
index b8c10cd7cf..c4256fd743 100644
--- a/src/core/CL/kernels/CLSelectKernel.h
+++ b/src/core/CL/kernels/CLSelectKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLSELECTKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -60,7 +61,11 @@ public:
      * @param[out] y               Second input tensor. Data types supported: Same as @p x
      * @param[in]  output          Output tensor. Data types supported: Same as @p x.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *c,
+                   const ICLTensor        *x,
+                   const ICLTensor        *y,
+                   ICLTensor              *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
      *
      * @param[in] c      Condition input tensor. Data types supported: U8.
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
deleted file mode 100644
index 526d9e187d..0000000000
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h"
-
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options.
- *
- * Prepares these build options:
- * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier.
- * -DIFF_MIN - threshold difference between maximum value of input data and current processed value,
- *             it defines whether the value will be taken into account or not.
- *
- * @param[in] build_opts  Build options to extend
- * @param[in] input_scale Input scaling factor
- * @param[in] beta        Exponent scaling factor beta
- */
-CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
-{
-    // Number of integer bits in temporary fixed-point representation of current-to-max difference
-    static const int scaled_diff_int_bits = 5;
-    // Number of integer bits used in temporary fixed-point representation of exponent accumulator
-    static const int exp_accumulation_in_bits = 12;
-
-    const double beta_multiplier = std::min(
-                                       1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
-                                       (1LL << 31) - 1.0);
-    int input_beta_multiplier;
-    int input_beta_left_shift;
-    quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
-
-    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
-    const int    diff_min           = -1.f * std::floor(max_input_rescaled);
-
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
-    build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits));
-    build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
-    build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
-    build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));
-
-    return build_opts;
-}
-
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
-
-    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        if(is_quantized_asymmetric)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
-
-    // Checks performed when sum is configured
-    if(sum->total_size() != 0)
-    {
-        if(is_quantized_asymmetric)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
-    }
-
-    return Status{};
-}
-
-Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type));
-
-    // Note: output should always have a scale of 1/256 and offset 0
-    const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
-    const bool             is_quantized_asymmetric   = is_data_type_quantized_asymmetric(info.input_data_type);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        if(!is_quantized_asymmetric)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-            ARM_COMPUTE_RETURN_ERROR_ON(output->quantization_info() != allowed_quantization_info);
-        }
-    }
-
-    return Status{};
-}
-} // namespace
-
-/**< Grid size (obtained through auto-tuning) */
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
-/**< Vector size in the serial case (obtained through auto-tuning) */
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
-/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
-
-CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
-    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
-{
-}
-
-void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, max, output, sum, info);
-}
-
-void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
-
-    auto padding_info = get_padding_info({ input, max, output, sum });
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*sum->info(), input->info()->clone()->set_tensor_shape(max->info()->tensor_shape()));
-    auto_init_if_empty(*output->info(), *input->info()->clone());
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
-
-    _input  = input;
-    _max    = max;
-    _output = output;
-    _sum    = sum;
-
-    const DataType                dt                 = input->info()->data_type();
-    const UniformQuantizationInfo qinfo              = input->info()->quantization_info().uniform();
-    const size_t                  reduction_dim_size = input->info()->dimension(0);
-    const float                   beta               = info.beta;
-    const auto                    is_signed_qasymm8  = is_data_type_quantized_asymmetric_signed(info.input_data_type);
-    const int                     min_value          = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
-
-    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
-    const unsigned int    vector_size             = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size);
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
-    build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
-    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
-    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size));
-    build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size));
-    build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
-    build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
-    build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
-    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
-    build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
-    build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX")));
-    build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
-
-    cl::NDRange lws_hint(cl::NullRange);
-    std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "");
-
-    // Configure parallel kernel if needed
-    if(std::get<0>(parallel_reduction_info))
-    {
-        kernel_name += "parallel";
-        bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
-        build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
-
-        // Handle boundary conditions.
-        const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
-        build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
-        // Setting _lws_hint in this way can also communicate grid_size to CLLogits1DMaxShiftExpSumKernel::run().
-        // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
-        lws_hint = cl::NDRange(_grid_size);
-    }
-    else
-    {
-        kernel_name += "serial";
-    }
-
-    // Create kernel.
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure window
-    Window win = calculate_max_window(*(input->info()), Steps(reduction_dim_size));
-    ICLKernel::configure_internal(win, lws_hint);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(input, max, output, sum));
-    return Status{};
-}
-
-CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
-{
-    bool         is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
-    unsigned int vector_size           = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
-    return std::make_tuple(is_parallel_reduction, vector_size);
-}
-
-void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    // Collapse window in Z dimension
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
-    // Reconfigure window in case of parallel reduction
-    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0));
-    if(std::get<0>(parallel_reduction_info))
-    {
-        // Launch grid_size parallel work items
-        window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1));
-    }
-
-    // Get slices
-    Window slice = window_collapsed.first_slice_window_3D();
-    do
-    {
-        unsigned int idx = 0;
-        // Set inputs
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _max, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        add_3D_tensor_argument(idx, _sum, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
-
-CLLogits1DNormKernel::CLLogits1DNormKernel()
-    : _input(nullptr), _sum(nullptr), _output(nullptr)
-{
-}
-
-void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, info);
-}
-
-void CLLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
-
-    auto padding_info = get_padding_info({ input, output, sum });
-
-    // Note: output should always have a scale of 1/256 and offset 0
-    const bool                    is_quantized_asymmetric   = is_data_type_quantized_asymmetric(info.input_data_type);
-    const DataType                output_data_type          = info.input_data_type;
-    const QuantizationInfo        allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
-    const UniformQuantizationInfo qinfo                     = input->info()->quantization_info().uniform();
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(),
-                       input->info()->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(input->info(), sum->info(), output->info(), info));
-
-    _input  = input;
-    _sum    = sum;
-    _output = output;
-
-    const auto         is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
-    const int          min_value         = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
-    const unsigned int vector_size       = adjust_vec_size(16, input->info()->dimension(0));
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type));
-    build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
-    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
-    build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
-    build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED");
-    build_opts.add_options_if(is_quantized_asymmetric,
-                              prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options());
-    build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
-
-    // Create kernel
-    std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : "");
-    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure window
-    auto win = calculate_max_window(*(input->info()), Steps(vector_size));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLLogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(input, sum, output, info));
-
-    return Status{};
-}
-
-void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice            = window_collapsed.first_slice_window_3D();
-
-    do
-    {
-        Window sum_slice = slice;
-        sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        unsigned int idx = 0;
-        // Set inputs
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _sum, sum_slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.h b/src/core/CL/kernels/CLSoftmaxLayerKernel.h
deleted file mode 100644
index 29e0f63e46..0000000000
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLSimple3DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for max, shifting, exponentiating and summing the logits */
-class CLLogits1DMaxShiftExpSumKernel : public ICLKernel
-{
-public:
-    /** Info for whether a parallel reduction will be run and the vector size of the execution. */
-    using ParallelReductionInfo = std::tuple<bool, unsigned int>;
-
-public:
-    /** Default constructor */
-    CLLogits1DMaxShiftExpSumKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DMaxShiftExpSumKernel(const CLLogits1DMaxShiftExpSumKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DMaxShiftExpSumKernel &operator=(const CLLogits1DMaxShiftExpSumKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLogits1DMaxShiftExpSumKernel(CLLogits1DMaxShiftExpSumKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]     input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in,out] max    Max values tensor. Data types supported: same as @p input
-     * @param[out]    output Destination tensor. Data types supported: same as @p input
-     * @param[out]    sum    Sum of 1D logits tensor. Data types supported: same as @p input
-     * @param[in]     info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     */
-    void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in,out] max             Max values tensor. Data types supported: same as @p input
-     * @param[out]    output          Destination tensor. Data types supported: same as @p input
-     * @param[out]    sum             Sum of 1D logits tensor. Data types supported: same as @p input
-     * @param[in]     info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] max    Max values tensor. Data types supported: same as @p input
-     * @param[in] output Destination tensor. Data types supported: same as @p input
-     * @param[in] sum    Sum of 1D logits tensor. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
-    /** Checks if the given size is eligible for parallel reduction
-     *
-     * @note  Serial reduction is launched for width < (_grid_size * _serial_vector_size).
-     * @note  Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
-     *
-     * @param[in] size Size to check
-     *
-     * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run,
-     *         while the second element is the vector size of the execution.
-     */
-    static ParallelReductionInfo is_parallel_reduction(size_t size);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_max;
-    ICLTensor       *_output;
-    ICLTensor       *_sum;
-
-private:
-    static const unsigned int _grid_size;
-    static const unsigned int _serial_vector_size;
-    static const unsigned int _parallel_vector_size;
-};
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class CLLogits1DNormKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLogits1DNormKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DNormKernel(const CLLogits1DNormKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DNormKernel &operator=(const CLLogits1DNormKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLogits1DNormKernel(CLLogits1DNormKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
-     * @param[in]  sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
-     * @param[in]  info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
-     * @param[in]  sum             Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[out] output          Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
-     * @param[in]  info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
-     * @param[in] sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[in] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
-     * @param[in] info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_sum;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 57f7af488b..f4c0839ad2 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,8 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -37,19 +39,22 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *block_info,
+                          const ITensorInfo *paddings,
+                          const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
     ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const DataLayout data_layout = input->data_layout();
         const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -60,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
 
     return Status{};
 }
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+                                 const int          block_shape_x,
+                                 const int          block_shape_y,
+                                 const Size2D      &padding_left,
+                                 const Size2D      &padding_right,
                                  const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -69,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
     ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+        TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+            input, block_shape_x, block_shape_y, padding_left, padding_right);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -84,18 +94,27 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
 CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
     : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+                                          const ICLTensor *block_shape,
+                                          const ICLTensor *paddings,
+                                          ICLTensor       *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
 }
 
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const ICLTensor        *block_shape,
+                                          const ICLTensor        *paddings,
+                                          ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
-    auto padding_info = get_padding_info({ input, block_shape, paddings, output });
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+    auto padding_info = get_padding_info({input, block_shape, paddings, output});
 
     _input       = input;
     _block_shape = block_shape;
@@ -109,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
     build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
     build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
-    _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(compile_context,
+                            "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                            build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
@@ -124,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
-                                          ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+                                          const int        block_shape_x,
+                                          const int        block_shape_y,
+                                          const Size2D    &padding_left,
+                                          const Size2D    &padding_right,
+                                          ICLTensor       *output)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+              padding_right, output);
 }
 
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
-                                          const Size2D &padding_right,
-                                          ICLTensor    *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const int               block_shape_x,
+                                          const int               block_shape_y,
+                                          const Size2D           &padding_left,
+                                          const Size2D           &padding_right,
+                                          ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+        input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+                       input->info()->quantization_info());
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+                                                         padding_right, output->info()));
 
     _input  = input;
     _output = output;
@@ -151,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
     build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
@@ -164,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
     build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
     build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
     build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
-    _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(
+        compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+        build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
     ICLKernel::configure_internal(win);
 }
 
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+                                           const ITensorInfo *block_shape,
+                                           const ITensorInfo *paddings,
+                                           const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
     return Status{};
 }
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+                                           const int          block_shape_x,
+                                           const int          block_shape_y,
+                                           const Size2D      &padding_left,
+                                           const Size2D      &padding_right,
                                            const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
     return Status{};
 }
 
@@ -216,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu
         add_3D_tensor_argument(idx, _output, slice_out);
         enqueue(queue, *this, slice_out, lws_hint());
         ++batch_id;
-    }
-    while(window.slide_window_slice_3D(slice_out));
+    } while (window.slide_window_slice_3D(slice_out));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
index 4817cfeef2..f9dce9db47 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -63,7 +64,11 @@ public:
      * @param[in]  paddings        2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
      * @param[out] output          Tensor output. Data types supported: same as @p input
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *block_shape,
+                   const ICLTensor        *paddings,
+                   ICLTensor              *output);
     /** Initialise the kernel's input and output. (Static block shape and paddings)
      *
      * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -73,7 +78,12 @@ public:
      * @param[in]  padding_right The padding at the end of every dimension of the output tensor.
      * @param[out] output        Tensor output. Data types supported: same as @p input
      */
-    void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+    void configure(const ICLTensor *input,
+                   const int        block_shape_x,
+                   const int        block_shape_y,
+                   const Size2D    &padding_left,
+                   const Size2D    &padding_right,
+                   ICLTensor       *output);
     /** Initialise the kernel's input and output. (Static block shape and paddings)
      *
      * @param[in]  compile_context The compile context to be used.
@@ -84,8 +94,13 @@ public:
      * @param[in]  padding_right   The padding at the end of every dimension of the output tensor.
      * @param[out] output          Tensor output. Data types supported: same as @p input
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
-                   ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const int               block_shape_x,
+                   const int               block_shape_y,
+                   const Size2D           &padding_left,
+                   const Size2D           &padding_right,
+                   ICLTensor              *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
      *
      * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -95,7 +110,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *block_shape,
+                           const ITensorInfo *paddings,
+                           const ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
      *
      * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -107,7 +125,12 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const int          block_shape_x,
+                           const int          block_shape_y,
+                           const Size2D      &padding_left,
+                           const Size2D      &padding_right,
+                           const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index 4e5b417ec6..25662b5c62 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,8 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -44,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
     ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const DataLayout data_layout = input->data_layout();
         const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -63,9 +65,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 }
 } // namespace
 
-CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel()
-    : _input(nullptr), _output(nullptr), _block_shape()
+CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
@@ -73,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out
     configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
 }
 
-void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          ICLTensor              *output,
+                                          int32_t                 block_shape)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape);
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
@@ -92,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
     build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
     build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
     build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
-    _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(compile_context,
+                            "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                            build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
@@ -134,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu
         enqueue(queue, *this, slice_out, lws_hint());
 
         ++batch_id;
-    }
-    while(window.slide_window_slice_3D(slice_out));
+    } while (window.slide_window_slice_3D(slice_out));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
index bb1ac5f9a6..d0932919e0 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -61,7 +62,8 @@ public:
      * @param[out] output          Tensor output. Data types supported: same as @p input
      * @param[in]  block_shape     Block shape value.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+    void
+    configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
      *
      * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 9bdcc8dc3f..23e26716e7 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,10 +30,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,11 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+                          unsigned int       axis,
+                          unsigned int       idx_input,
+                          unsigned int       num_tensors,
+                          const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
     ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                           compute_stack_shape(*input, axis, num_tensors));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
@@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
 {
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -73,17 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
 }
 } // namespace
 
-CLStackLayerKernel::CLStackLayerKernel()
-    : _input(nullptr), _output(nullptr)
+CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(
+    const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output);
 }
 
-void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(const CLCompileContext &compile_context,
+                                   const ICLTensor        *input,
+                                   unsigned int            axis,
+                                   unsigned int            idx_input,
+                                   unsigned int            num_tensors,
+                                   ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -111,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons
     _kernel.setArg<cl_uint>(idx, idx_input);
 }
 
-Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status CLStackLayerKernel::validate(const ITensorInfo *input,
+                                    unsigned int       axis,
+                                    unsigned int       idx_input,
+                                    unsigned int       num_tensors,
+                                    const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
     return Status{};
 }
 
diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h
index 2865127a90..d3c17f529c 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.h
+++ b/src/core/CL/kernels/CLStackLayerKernel.h
@@ -26,6 +26,7 @@
 #define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -60,7 +61,8 @@ public:
      * @param[out] output      Output tensor. Data types supported: Same as @p input.
      *
      */
-    void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+    void configure(
+        const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
     /** Initialise the kernel's inputs and output
      *
      * @note Supported input tensor rank: up to 4
@@ -74,7 +76,12 @@ public:
      * @param[out] output          Output tensor. Data types supported: Same as @p input.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   unsigned int            axis,
+                   unsigned int            idx_input,
+                   unsigned int            num_tensors,
+                   ICLTensor              *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
      *
      * @note Supported input tensor rank: up to 4
@@ -88,7 +95,11 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           unsigned int       axis,
+                           unsigned int       idx_input,
+                           unsigned int       num_tensors,
+                           const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 75fd01df14..20cd835069 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,10 +22,13 @@
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLStridedSliceKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/bit_ops.h"
@@ -36,9 +39,14 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                          int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *output,
+                          const Coordinates &starts,
+                          const Coordinates &ends,
+                          const BiStrides   &strides,
+                          int32_t            begin_mask,
+                          int32_t            end_mask,
+                          int32_t            shrink_axis_mask)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -47,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
     ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
-    {
-        return i == 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
 
     // Get expected output shape
-    const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
-                                                                                                          starts, ends, strides,
-                                                                                                          begin_mask, end_mask, shrink_axis_mask);
+    const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+        *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
 
     // Checks output if configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -68,46 +73,42 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
 
     return Status{};
 }
+} // namespace
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
-                                                        const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                                        int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+CLStridedSliceKernel::CLStridedSliceKernel()
 {
-    // Output tensor auto initialization if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
-                                                                                                      starts, ends, strides,
-                                                                                                      begin_mask, end_mask, shrink_axis_mask);
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-
-    // Create window
-    Window win = calculate_max_window(*output, Steps());
-    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-
-    return std::make_pair(Status{}, win);
+    _type = CLKernelType::ELEMENTWISE;
 }
-} // namespace
 
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
-                                     const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                     int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context,
+                                     const ITensorInfo      *input,
+                                     ITensorInfo            *output,
+                                     const Coordinates      &starts,
+                                     const Coordinates      &ends,
+                                     const BiStrides        &strides,
+                                     int32_t                 begin_mask,
+                                     int32_t                 end_mask,
+                                     int32_t                 shrink_axis_mask)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    auto padding_info = get_padding_info({ input, output });
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    auto padding_info = get_padding_info({input, output});
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
 
     const TensorShape &input_shape = input->tensor_shape();
 
     Coordinates starts_abs;
     Coordinates ends_abs;
     Coordinates final_strides;
-    std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
-                                                        input_shape,
-                                                        starts, ends, strides,
-                                                        begin_mask, end_mask, shrink_axis_mask);
+    std::tie(starts_abs, ends_abs, final_strides) =
+        arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+                                                                               begin_mask, end_mask, shrink_axis_mask);
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+        *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+    Window win = calculate_max_window(*output, Steps());
 
     // Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size
     const int  vec_size_x     = 16 / input->element_size();
@@ -116,29 +117,31 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
     const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
 
     // Update window if needed
-    if(multi_access_x)
+    if (multi_access_x)
     {
-        Window &updated_window = std::get<1>(win_config);
+        Window &updated_window = win;
         updated_window.set(Window::DimX,
-                           Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+                           Window::Dimension(updated_window.x().start(),
+                                             ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
     }
-    ICLKernel::configure_internal(win_config.second);
+    ICLKernel::configure_internal(win);
 
     // Create build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
-    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
+    for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
-        build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
-        build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+        build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" +
+                              support::cpp11::to_string(starts_abs[i]));
+        build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" +
+                              support::cpp11::to_string(final_strides[i]));
         build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
     }
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+                                                                        std::max<int>(output_width_x - vec_size_x, 0)));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
-                                  "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
-                                  "-DSRC_DEPTH=1");
     build_opts.add_option_if_else(output->num_dimensions() > 2,
                                   "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
                                   "-DDST_DEPTH=1");
@@ -150,7 +153,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
     _config_id = "strided_slice";
     _config_id += "_";
     _config_id += lower_string(string_from_data_type(input->data_type()));
-    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         _config_id += "_";
         _config_id += support::cpp11::to_string(input->dimension(i));
@@ -164,14 +167,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSliceKernel::validate(const ITensorInfo *input,
+                                      const ITensorInfo *output,
+                                      const Coordinates &starts,
+                                      const Coordinates &ends,
+                                      const BiStrides   &strides,
+                                      int32_t            begin_mask,
+                                      int32_t            end_mask,
+                                      int32_t            shrink_axis_mask)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
-                                                              starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
 
     return Status{};
 }
@@ -181,8 +187,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice            = window_collapsed.first_slice_window_4D();
@@ -193,7 +200,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
         add_4D_tensor_argument(idx, src, slice);
         add_4D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_4D(slice));
+    } while (window_collapsed.slide_window_slice_4D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
index 599cf34c39..1cf5bcacec 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.h
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 #include <cstdint>
@@ -35,6 +36,9 @@ namespace arm_compute
 class CLStridedSliceKernel : public ICLKernel
 {
 public:
+    /** Default constructor */
+    CLStridedSliceKernel();
+
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
@@ -50,9 +54,15 @@ public:
      * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *input,
+                   ITensorInfo            *output,
+                   const Coordinates      &starts,
+                   const Coordinates      &ends,
+                   const BiStrides        &strides,
+                   int32_t                 begin_mask,
+                   int32_t                 end_mask,
+                   int32_t                 shrink_axis_mask);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
      *
@@ -68,9 +78,14 @@ public:
      * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                           int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const Coordinates &starts,
+                           const Coordinates &ends,
+                           const BiStrides   &strides,
+                           int32_t            begin_mask,
+                           int32_t            end_mask,
+                           int32_t            shrink_axis_mask);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index c0c3d2e2ee..fa996c4008 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,11 @@
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLTileKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -38,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
-    {
-        return e == 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
@@ -54,9 +55,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 }
 } // namespace
 
-CLTileKernel::CLTileKernel()
-    : _input(nullptr), _output(nullptr)
+CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
@@ -64,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu
     configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
 }
 
-void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTileKernel::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input,
+                             ICLTensor              *output,
+                             const Multiples        &multiples)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -78,11 +82,13 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
     _input  = input;
     _output = output;
 
-    const DataType     data_type      = input->info()->data_type();
-    const int          vec_size_x     = 16 / input->info()->element_size();
-    const int          input_width_x  = input->info()->tensor_shape().x();
-    const unsigned int offset         = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
-    const bool         multi_access_x = (input_width_x / vec_size_x > 0);
+    const DataType     data_type         = input->info()->data_type();
+    const int          vec_size_x        = 16 / input->info()->element_size();
+    const int          input_width_x     = input->info()->tensor_shape().x();
+    const unsigned int input_width_ceil  = ceil_to_multiple(input_width_x, vec_size_x);
+    const unsigned int input_width_tiles = input_width_ceil / vec_size_x;
+    const unsigned int offset            = input_width_ceil - input_width_x;
+    const bool         multi_access_x    = (input_width_x / vec_size_x > 0);
 
     // Create kernel
     CLBuildOptions build_opts;
@@ -94,20 +100,20 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
     build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
     build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x, "-DSRC_WIDTH_TILES=" + support::cpp11::to_string(input_width_tiles));
     _kernel = create_kernel(compile_context, "tile", build_opts.options());
 
     // Configure window without padding
     Window win = calculate_max_window(*output->info());
 
-    if(multi_access_x)
+    if (multi_access_x)
     {
         // If multi-access is enabled, no thread should cross the tile boundaries. This means we need
         // as many threads as those to cover a single tile times multiples[0]. Note that if threads
         // do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and
         // we don't need to pad the output
         const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
-        win.set(Window::DimX,
-                Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+        win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x));
     }
 
     ICLKernel::configure_internal(win);
@@ -116,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
     _config_id = "tile";
     _config_id += "_";
     _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    for(unsigned int i = 0; i < multiples.size(); ++i)
+    for (unsigned int i = 0; i < multiples.size(); ++i)
     {
         _config_id += "_";
         _config_id += support::cpp11::to_string(input->info()->dimension(i));
@@ -145,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
         add_4D_tensor_argument(idx, _input, slice);
         add_4D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_4D(slice));
+    } while (collapsed.slide_window_slice_4D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h
index 41752ca90b..c3486aecef 100644
--- a/src/core/CL/kernels/CLTileKernel.h
+++ b/src/core/CL/kernels/CLTileKernel.h
@@ -64,7 +64,10 @@ public:
      * @param[out] output          Destination tensor. Same as @p input
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const Multiples        &multiples);
     /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
      *
      * @param[in] input     Source tensor info. Data type supported: All.
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
deleted file mode 100644
index 56ff48be1f..0000000000
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLTransposeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-TensorShape transposed_tensor_shape(const TensorShape &in)
-{
-    TensorShape  output_shape{ in };
-    const size_t w_out = in[1];
-    const size_t h_out = in[0];
-    output_shape.set(0, w_out);
-    output_shape.set(1, h_out);
-
-    return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info = input->clone()->set_tensor_shape(transposed_tensor_shape(input->tensor_shape()));
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-} // namespace
-
-Status CLTransposeKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    return Status{};
-}
-
-void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLTransposeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(transposed_tensor_shape(input->info()->tensor_shape())));
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-    auto padding_info = get_padding_info({ input, output });
-
-    _input  = input;
-    _output = output;
-
-    const unsigned int vec_size_x           = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
-    const int          vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
-    const unsigned int vec_size_y           = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(1));
-    const int          vec_size_y_leftovers = input->info()->dimension(1) % vec_size_y;
-
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(input->info()->element_size()));
-    build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers));
-    build_opts.add_option("-DVEC_SIZE_Y=" + support::cpp11::to_string(vec_size_y));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER_Y=" + support::cpp11::to_string(vec_size_y_leftovers));
-
-    _kernel = create_kernel(compile_context, "transpose", build_opts.options());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(vec_size_x, vec_size_y));
-    ICLKernel::configure_internal(win, cl::NDRange(2, 8));
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLTransposeKernel.h b/src/core/CL/kernels/CLTransposeKernel.h
deleted file mode 100644
index 0c4b7b4aff..0000000000
--- a/src/core/CL/kernels/CLTransposeKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTRANSPOSEKERNEL_H
-#define ARM_COMPUTE_CLTRANSPOSEKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel which transposes the elements of a matrix.
- *
- * [width, height, batch] -> [height, width, batch]
- *
- */
-class CLTransposeKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: All.
-     * @param[out] output Output tensor. Data type supported: Same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All.
-     * @param[out] output          Output tensor. Data type supported: Same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel
-     *
-     * @param[in] input  Input tensor. Data types supported: All.
-     * @param[in] output Output tensor. Data type supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTRANSPOSEKERNEL_H */
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
deleted file mode 100644
index 559f47ce26..0000000000
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type()));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
-    }
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-} // namespace
-
-CLWeightsReshapeKernel::CLWeightsReshapeKernel()
-    : _input(nullptr), _biases(nullptr), _output(nullptr)
-{
-}
-
-void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, biases, output, num_groups);
-}
-
-void CLWeightsReshapeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr), num_groups)));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
-                                                  (biases != nullptr) ? biases->info() : nullptr,
-                                                  output->info(), num_groups));
-
-    auto padding_info = get_padding_info({ input, biases, output });
-
-    const DataType data_type = input->info()->data_type();
-
-    _biases = biases;
-    _output = output;
-    _input  = input;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type)));
-    build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
-    build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options());
-
-    // Configure window
-    Window win = calculate_max_window(*input->info(), Steps());
-    // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-    ICLKernel::configure_internal(win);
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output, num_groups));
-    return Status{};
-}
-
-void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    Window out_window;
-    out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
-    Window in_slice  = window.first_slice_window_3D();
-    Window out_slice = out_window.first_slice_window_2D();
-
-    Window biases_window;
-    Window biases_slice;
-
-    unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
-    idx += (_biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
-    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
-    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
-    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
-    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
-    _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes().z());
-
-    if(_biases != nullptr)
-    {
-        biases_window.use_tensor_dimensions(_biases->info()->tensor_shape());
-        biases_slice = biases_window.first_slice_window_1D();
-    }
-
-    do
-    {
-        // Set arguments
-        unsigned idx = 0;
-        add_3D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
-        if(_biases != nullptr)
-        {
-            add_1D_tensor_argument(idx, _biases, biases_slice);
-            ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
-        }
-
-        // Run kernel
-        enqueue(queue, *this, in_slice, lws_hint());
-    }
-    while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.h b/src/core/CL/kernels/CLWeightsReshapeKernel.h
deleted file mode 100644
index 402a60472b..0000000000
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref CLIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class CLWeightsReshapeKernel : public ICLKernel
-{
-public:
-    /** Constructor.*/
-    CLWeightsReshapeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
-    /** Default destructor */
-    ~CLWeightsReshapeKernel() = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                        and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
-     * @param[in]  biases     The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                        dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
-     *                        @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[out] output     The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
-     *                        Data types supported: Same as @p input
-     * @param[in]  num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     *                        Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                             and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
-     * @param[in]  biases          The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                             dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
-     *                             @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[out] output          The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
-     *                             Data types supported: Same as @p input
-     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     *                             Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
-     *
-     * @param[in] input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                       and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
-     * @param[in] biases     The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                       dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
-     *                       @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[in] output     The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
-     *                       Data types supported: Same as @p input
-     * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     *                       Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_biases;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H */
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
deleted file mode 100644
index bd45ddb65f..0000000000
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
-    const Size2D kernel_size      = winograd_info.kernel_size;
-    const Size2D output_tile_size = winograd_info.output_tile_size;
-
-    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_UNUSED(output);
-
-    const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
-    const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
-    const unsigned int num_elems_read_per_iteration_z      = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
-
-    Window win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z));
-    Window win_collapsed = win.collapse(win, Window::DimZ);
-    return std::make_pair(Status{}, win_collapsed);
-}
-} // namespace
-
-CLWinogradFilterTransformKernel::CLWinogradFilterTransformKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info);
-}
-
-void CLWinogradFilterTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input->info(), winograd_info)));
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
-    auto padding_info = get_padding_info({ input, output });
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL");
-    build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL");
-    const Size2D kernel_size      = winograd_info.kernel_size;
-    const Size2D output_tile_size = winograd_info.output_tile_size;
-
-    // Create kernel
-    std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(input->info()->data_layout()));
-    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLWinogradFilterTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-void CLWinogradFilterTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Setup output window
-    Window window_out;
-    window_out.use_tensor_dimensions(_output->info()->tensor_shape(), 0);
-
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, window);
-    add_3D_tensor_argument(idx, _output, window_out);
-    enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.h b/src/core/CL/kernels/CLWinogradFilterTransformKernel.h
deleted file mode 100644
index d22fedebcd..0000000000
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Winograd filter transform kernel. */
-class CLWinogradFilterTransformKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWinogradFilterTransformKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradFilterTransformKernel(const CLWinogradFilterTransformKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradFilterTransformKernel &operator=(const CLWinogradFilterTransformKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWinogradFilterTransformKernel(CLWinogradFilterTransformKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWinogradFilterTransformKernel &operator=(CLWinogradFilterTransformKernel &&) = default;
-    /** Default destructor */
-    ~CLWinogradFilterTransformKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @note Winograd filter transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd filter transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  input         Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
-     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
-    /** Set the input and output tensor.
-     *
-     * @note Winograd filter transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd filter transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
-     * @param[out] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradFilterTransformKernel
-     *
-     * @note Winograd filter transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd filter transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  input         Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
-     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H */
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
deleted file mode 100644
index 392edda615..0000000000
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
-    const PadStrideInfo conv_info        = winograd_info.convolution_info;
-    const Size2D        output_tile_size = winograd_info.output_tile_size;
-    const Size2D        kernel_size      = winograd_info.kernel_size;
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported");
-
-    ARM_COMPUTE_UNUSED(conv_info);
-    ARM_COMPUTE_UNUSED(output_tile_size);
-    ARM_COMPUTE_UNUSED(kernel_size);
-
-    // Validate configured output
-    if(output->total_size() != 0)
-    {
-        const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
-{
-    ARM_COMPUTE_UNUSED(output);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    bool   window_changed = false;
-    Window win            = calculate_max_window(*input, Steps(1, 1));
-
-    if(input->data_layout() == DataLayout::NCHW)
-    {
-        const PadStrideInfo conv_info        = winograd_info.convolution_info;
-        const Size2D        output_tile_size = winograd_info.output_tile_size;
-        const Size2D        kernel_size      = winograd_info.kernel_size;
-
-        unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
-        unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
-
-        AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
-        window_changed = update_window_and_padding(win, input_access);
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
-    : _border_size(0), _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
-{
-}
-
-BorderSize CLWinogradInputTransformKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info);
-}
-
-void CLWinogradInputTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
-
-    auto padding_info = get_padding_info({ input, output });
-
-    const PadStrideInfo conv_info        = winograd_info.convolution_info;
-    const Size2D        output_tile_size = winograd_info.output_tile_size;
-    const Size2D        kernel_size      = winograd_info.kernel_size;
-
-    _data_layout = input->info()->data_layout();
-
-    const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
-    // Compute the number of output tiles along the x and y direction of size "output_tile_size"
-    const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input->info()->dimension(idx_w), input->info()->dimension(idx_h)),
-                                                                kernel_size,
-                                                                output_tile_size,
-                                                                conv_info);
-
-    _input       = input;
-    _output      = output;
-    _num_tiles_x = num_tiles.width;
-    _num_tiles_y = num_tiles.height;
-
-    const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input->info(), winograd_info);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
-    ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(output->info()->dimension(1)));
-    const size_t total_batches = input->info()->tensor_shape().total_size_upper(3);
-
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
-    build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
-    build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
-    build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
-    build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
-    build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
-    if(_data_layout == DataLayout::NHWC)
-    {
-        build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
-        build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
-        build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
-    }
-    else
-    {
-        build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
-    }
-
-    // Create kernel
-    std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
-
-    // Get the maximum dimension from the tile size
-    const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
-
-    // Check optimized kernel if output_dims == 2x2
-    if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
-    {
-        _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
-    }
-
-    // Append stepz and data layout
-    kernel_name += "_stepz";
-    kernel_name += support::cpp11::to_string(_step_z);
-    kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
-
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Create window and update padding
-    auto win_config = validate_and_configure_window(input->info(), output->info(), winograd_info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8));
-
-    _border_size = BorderSize(_input->info()->padding());
-
-    ARM_COMPUTE_ERROR_ON((input->info()->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info));
-
-    _config_id = kernel_name;
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(conv_info.pad_left());
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(conv_info.pad_top());
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), winograd_info).first);
-
-    return Status{};
-}
-
-void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    const size_t idx_w         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const size_t idx_h         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const size_t idx_c         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-    const size_t total_batches = window.shape().total_size_upper(3);
-
-    // Collapse window
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
-    Window slice = window_collapsed.first_slice_window_3D();
-    slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
-    slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
-    if(_data_layout == DataLayout::NHWC)
-    {
-        slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
-    }
-
-    ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0);
-    slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z));
-
-    unsigned int idx = 2 * num_arguments_per_3D_tensor();
-    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
-    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.h b/src/core/CL/kernels/CLWinogradInputTransformKernel.h
deleted file mode 100644
index 25301877e6..0000000000
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform Winograd input transform.*/
-class CLWinogradInputTransformKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWinogradInputTransformKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradInputTransformKernel(const CLWinogradInputTransformKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradInputTransformKernel &operator=(const CLWinogradInputTransformKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWinogradInputTransformKernel(CLWinogradInputTransformKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWinogradInputTransformKernel &operator=(CLWinogradInputTransformKernel &&) = default;
-    /** Set the input and output of the kernel.
-     *
-     * @note Winograd input transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd input transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in] input         The input tensor to transform. Data types supported: F16/F32
-     * @param[in] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
-     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
-    /** Set the input and output of the kernel.
-     *
-     * @note Winograd input transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd input transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in] compile_context The compile context to be used.
-     * @param[in] input           The input tensor to transform. Data types supported: F16/F32
-     * @param[in] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
-     * @param[in] winograd_info   Contains Winograd's information described in @ref WinogradInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransformKernel
-     *
-     * @note Winograd input transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd input transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in] input         The input tensor to transform. Data types supported: F16/F32
-     * @param[in] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
-     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
-    BorderSize       _border_size;
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    DataLayout       _data_layout;
-    int              _num_tiles_x;
-    int              _num_tiles_y;
-    unsigned int     _step_z;
-};
-} // arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H */
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
deleted file mode 100644
index 2018559f60..0000000000
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout);
-
-    const PadStrideInfo conv_info        = winograd_info.convolution_info;
-    const Size2D        output_tile_size = winograd_info.output_tile_size;
-    const Size2D        kernel_size      = winograd_info.kernel_size;
-    const Size2D        input_dimensions = winograd_info.input_dimensions;
-    const unsigned int  num_channels     = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels");
-
-    // Compute number of elements to process in the X and Y direction
-    // Compute the number of output tiles along the x and y direction of size "output_tile_size"
-    const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
-                                                                kernel_size,
-                                                                output_tile_size,
-                                                                conv_info);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area())));
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
-    }
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_UNUSED(bias);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
-    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    bool   window_changed = false;
-
-    if(output->data_layout() == DataLayout::NCHW)
-    {
-        const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width);
-        const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height);
-
-        AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
-        AccessWindowStatic    output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y);
-        window_changed = update_window_and_padding(win, input_access, output_access);
-        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr), _is_nhwc(false)
-{
-}
-
-void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, winograd_info, act_info);
-}
-
-void CLWinogradOutputTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info,
-                                                const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), winograd_info)));
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info, act_info));
-
-    auto padding_info = get_padding_info({ input, bias, output });
-
-    _input   = input;
-    _bias    = bias;
-    _output  = output;
-    _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
-
-    // Compute num_tiles_x
-    const Size2D        input_dimensions = winograd_info.input_dimensions;
-    const Size2D        kernel_size      = winograd_info.kernel_size;
-    const Size2D        output_tile_size = winograd_info.output_tile_size;
-    const PadStrideInfo conv_info        = winograd_info.convolution_info;
-    const int           idx_width        = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
-    const int           idx_height       = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
-
-    // Compute the number of output tiles along the x and y direction of size "output_tile_size"
-    const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
-                                                                kernel_size,
-                                                                output_tile_size,
-                                                                conv_info);
-    const size_t total_batches = output->info()->tensor_shape().total_size_upper(3);
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
-    build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
-    build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-
-    if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
-    {
-        build_opts.add_option("-DVEC_SIZE=2");
-    }
-    else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
-    {
-        build_opts.add_option("-DVEC_SIZE=4");
-    }
-
-    build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
-    build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
-    build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
-    build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(idx_width)));
-    build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(idx_height)));
-    build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
-    build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
-    build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
-
-    // Create kernel
-    std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout));
-    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info.output_tile_size);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout));
-
-    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc);
-}
-
-Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(), winograd_info.output_tile_size).first);
-
-    return Status{};
-}
-
-void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Collapse window
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
-    // Get initial windows
-    Window slice = window_collapsed.first_slice_window_4D();
-    slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    // Setup output slice
-    Window slice_out(slice);
-    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    if(_bias != nullptr)
-    {
-        unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
-        Window       slice_biases;
-        slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
-        add_1D_tensor_argument(idx1, _bias, slice_biases);
-    }
-
-    if(_is_nhwc)
-    {
-        unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
-        _kernel.setArg(idx2, static_cast<int>(_output->info()->total_size() - _output->info()->strides_in_bytes().y()));
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input, slice);
-        add_4D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.h b/src/core/CL/kernels/CLWinogradOutputTransformKernel.h
deleted file mode 100644
index 632a5629d9..0000000000
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Winograd output transform kernel. */
-class CLWinogradOutputTransformKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWinogradOutputTransformKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradOutputTransformKernel(const CLWinogradOutputTransformKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradOutputTransformKernel &operator=(const CLWinogradOutputTransformKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWinogradOutputTransformKernel(CLWinogradOutputTransformKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWinogradOutputTransformKernel &operator=(CLWinogradOutputTransformKernel &&) = default;
-    /** Default destructor */
-    ~CLWinogradOutputTransformKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @note Winograd output transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd output transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  input         Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
-     * @param[in]  bias          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
-     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
-     * @param[in]  act_info      (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Set the input and output tensor.
-     *
-     * @note Winograd output transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd output transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
-     * @param[in]  bias            Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
-     * @param[out] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel
-     *
-     * @note Winograd output transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd output transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  input         Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
-     * @param[in]  bias          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
-     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
-     * @param[in]  act_info      (Optional) Activation layer information in case of a fused activation @ref ActivationLayerInfo. Only RELU, BOUNDED_RELU, LU_BOUNDED_RELU, LEAKY_RELU and SOFT_RELU supported.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-    bool             _is_nhwc;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H */
diff --git a/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
deleted file mode 100644
index 4c92ae417f..0000000000
--- a/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
- */
-class ICLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    ICLDepthwiseConvolutionLayer3x3Kernel()
-        : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _output_multipliers(), _output_shifts(), _is_quantized(false)
-    {
-    }
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLDepthwiseConvolutionLayer3x3Kernel(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLDepthwiseConvolutionLayer3x3Kernel &operator=(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
-    /** Default Move Constructor. */
-    ICLDepthwiseConvolutionLayer3x3Kernel(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
-    /** Default move assignment operator */
-    ICLDepthwiseConvolutionLayer3x3Kernel &operator=(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
-     *                                Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                           const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
-     *                                Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                           const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
-
-protected:
-    BorderSize       _border_size;
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    const ICLTensor *_weights;
-    const ICLTensor *_biases;
-    unsigned int     _conv_stride_y;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _is_quantized;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H */