From 7d61ff041826782d14e67b7f5b7a2864905ff38b Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Mon, 18 Jan 2021 21:15:59 +0000 Subject: Make all CL Concatenate kernels and functions state-less Resolves COMPMID-3995 Change-Id: I84172bed20924f1d9ae3b4d14d7b321e9494296e Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4887 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- Android.bp | 13 +- SConscript | 4 + .../runtime/CL/functions/CLConcatenateLayer.h | 69 +----- docs/00_introduction.dox | 20 +- src/core/CL/CLKernels.h | 8 +- .../CL/kernels/CLBatchConcatenateLayerKernel.cpp | 150 ------------ .../CL/kernels/CLBatchConcatenateLayerKernel.h | 82 ------- .../CL/kernels/CLDepthConcatenateLayerKernel.cpp | 135 ----------- .../CL/kernels/CLDepthConcatenateLayerKernel.h | 80 ------- .../CL/kernels/CLHeightConcatenateLayerKernel.cpp | 128 ----------- .../CL/kernels/CLHeightConcatenateLayerKernel.h | 77 ------- .../kernels/CLWidthConcatenate2TensorsKernel.cpp | 150 ------------ .../CL/kernels/CLWidthConcatenate2TensorsKernel.h | 73 ------ .../kernels/CLWidthConcatenate4TensorsKernel.cpp | 180 --------------- .../CL/kernels/CLWidthConcatenate4TensorsKernel.h | 77 ------- .../CL/kernels/CLWidthConcatenateLayerKernel.cpp | 123 ---------- .../CL/kernels/CLWidthConcatenateLayerKernel.h | 74 ------ src/core/gpu/cl/ClCompileContext.h | 36 +++ src/core/gpu/cl/IClKernel.h | 37 +++ .../gpu/cl/kernels/ClBatchConcatenateKernel.cpp | 156 +++++++++++++ src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h | 77 +++++++ .../gpu/cl/kernels/ClDepthConcatenateKernel.cpp | 141 ++++++++++++ src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h | 77 +++++++ .../gpu/cl/kernels/ClHeightConcatenateKernel.cpp | 134 +++++++++++ .../gpu/cl/kernels/ClHeightConcatenateKernel.h | 74 ++++++ .../kernels/ClWidthConcatenate2TensorsKernel.cpp | 156 +++++++++++++ .../cl/kernels/ClWidthConcatenate2TensorsKernel.h | 70 ++++++ .../kernels/ClWidthConcatenate4TensorsKernel.cpp | 186 +++++++++++++++ .../cl/kernels/ClWidthConcatenate4TensorsKernel.h | 75 ++++++ .../gpu/cl/kernels/ClWidthConcatenateKernel.cpp | 129 +++++++++++ src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h | 71 ++++++ src/runtime/CL/functions/CLConcatenateLayer.cpp | 243 +------------------- src/runtime/gpu/cl/IClOperator.h | 37 +++ src/runtime/gpu/cl/operators/ClConcatenate.cpp | 254 +++++++++++++++++++++ src/runtime/gpu/cl/operators/ClConcatenate.h | 86 +++++++ 35 files changed, 1839 insertions(+), 1643 deletions(-) delete mode 100644 src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp delete mode 100644 src/core/CL/kernels/CLBatchConcatenateLayerKernel.h delete mode 100644 src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp delete mode 100644 src/core/CL/kernels/CLDepthConcatenateLayerKernel.h delete mode 100644 src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp delete mode 100644 src/core/CL/kernels/CLHeightConcatenateLayerKernel.h delete mode 100644 src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp delete mode 100644 src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h delete mode 100644 src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp delete mode 100644 src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h delete mode 100644 src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp delete mode 100644 src/core/CL/kernels/CLWidthConcatenateLayerKernel.h create mode 100644 src/core/gpu/cl/ClCompileContext.h create mode 100644 src/core/gpu/cl/IClKernel.h create mode 100644 
src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h create mode 100644 src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h create mode 100644 src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h create mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h create mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h create mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h create mode 100644 src/runtime/gpu/cl/IClOperator.h create mode 100644 src/runtime/gpu/cl/operators/ClConcatenate.cpp create mode 100644 src/runtime/gpu/cl/operators/ClConcatenate.h diff --git a/Android.bp b/Android.bp index 41ed188e6d..4427bd4fee 100644 --- a/Android.bp +++ b/Android.bp @@ -82,7 +82,6 @@ cc_library_static { "src/core/CL/kernels/CLAccumulateKernel.cpp", "src/core/CL/kernels/CLActivationLayerKernel.cpp", "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp", - "src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp", "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp", "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp", "src/core/CL/kernels/CLBitwiseKernel.cpp", @@ -101,7 +100,6 @@ cc_library_static { "src/core/CL/kernels/CLCropKernel.cpp", "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp", "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp", - "src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp", "src/core/CL/kernels/CLDepthConvertLayerKernel.cpp", "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp", "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp", @@ -145,7 +143,6 @@ cc_library_static { "src/core/CL/kernels/CLHOGDescriptorKernel.cpp", "src/core/CL/kernels/CLHOGDetectorKernel.cpp", "src/core/CL/kernels/CLHarrisCornersKernel.cpp", - "src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp", "src/core/CL/kernels/CLHistogramKernel.cpp", "src/core/CL/kernels/CLIm2ColKernel.cpp", "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp", @@ -197,9 +194,6 @@ cc_library_static { "src/core/CL/kernels/CLWarpAffineKernel.cpp", "src/core/CL/kernels/CLWarpPerspectiveKernel.cpp", "src/core/CL/kernels/CLWeightsReshapeKernel.cpp", - "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp", - "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp", - "src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp", "src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp", "src/core/CL/kernels/CLWinogradInputTransformKernel.cpp", "src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp", @@ -437,6 +431,12 @@ cc_library_static { "src/core/cpu/kernels/add/sve/qsymm16.cpp", "src/core/cpu/kernels/floor/NEON/fp16.cpp", "src/core/cpu/kernels/floor/NEON/fp32.cpp", + "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp", "src/core/helpers/SoftmaxHelpers.cpp", "src/core/helpers/WindowHelpers.cpp", 
"src/core/utils/ScaleUtils.cpp", @@ -781,6 +781,7 @@ cc_library_static { "src/runtime/cpu/operators/CpuFloor.cpp", "src/runtime/cpu/operators/CpuPermute.cpp", "src/runtime/cpu/operators/CpuReshape.cpp", + "src/runtime/gpu/cl/operators/ClConcatenate.cpp", "utils/CommonGraphOptions.cpp", "utils/GraphUtils.cpp", "utils/Utils.cpp", diff --git a/SConscript b/SConscript index 8b8e504832..121cf3220a 100644 --- a/SConscript +++ b/SConscript @@ -212,11 +212,15 @@ if env['opencl']: core_files += Glob('src/core/CL/gemm/native/*.cpp') core_files += Glob('src/core/CL/gemm/reshaped/*.cpp') core_files += Glob('src/core/CL/gemm/reshaped_only_rhs/*.cpp') + core_files += Glob('src/core/gpu/cl/*.cpp') + core_files += Glob('src/core/gpu/cl/kernels/*.cpp') runtime_files += Glob('src/runtime/CL/*.cpp') runtime_files += Glob('src/runtime/CL/functions/*.cpp') runtime_files += Glob('src/runtime/CL/gemm/*.cpp') runtime_files += Glob('src/runtime/CL/tuners/*.cpp') + runtime_files += Glob('src/runtime/gpu/cl/*.cpp') + runtime_files += Glob('src/runtime/gpu/cl/operators/*.cpp') graph_files += Glob('src/graph/backends/CL/*.cpp') diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h index 5e7003a112..bfc8a39ac9 100644 --- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h +++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_CLCONCATENATELAYER_H #define ARM_COMPUTE_CLCONCATENATELAYER_H -#include "arm_compute/runtime/CL/ICLOperator.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/Types.h" @@ -43,10 +42,10 @@ class Status; /** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels: * - * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0). - * -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1). - * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2). - * -# @ref CLBatchConcatenateLayerKernel (if underlying concatenation axis is 3). + * -# @ref opencl::kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0). + * -# @ref opencl::kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1). + * -# @ref opencl::kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2). + * -# @ref opencl::kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3). */ class CLConcatenateLayer : public IFunction { @@ -66,7 +65,8 @@ public: /** Initialise the kernel's inputs vector and output. * * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. - * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. + * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel, + * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel. * * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All * @param[out] output Output tensor. Data types supported: Same as @p input. @@ -76,7 +76,8 @@ public: /** Initialise the kernel's inputs vector and output. 
* * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. - * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. + * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel, + * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel. * * @param[in] compile_context The compile context to be used. * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All @@ -87,7 +88,8 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer * * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. - * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. + * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel, + * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel. * * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: All. * @param[in] output Output tensor info. Data types supported: Same as @p input. @@ -104,54 +106,5 @@ private: struct Impl; std::unique_ptr _impl; }; - -namespace experimental -{ -/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels: - * - * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0). - * -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1). - * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2). - * -# @ref CLBatchConcatenateLayerKernel (if underlying concatenation axis is 3). - */ -class CLConcatenation : public ICLOperator -{ -public: - /** Default constructor */ - CLConcatenation(); - /** Initialise the kernel's inputs vector and output. - * - * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. - * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. - * - * - * @param[in] compile_context The compile context to be used. - * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All - * @param[out] output Output tensor. Data types supported: Same as @p input. - * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. - */ - void configure(const CLCompileContext &compile_context, const std::vector &inputs_vector, ITensorInfo *output, size_t axis); - /** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer - * - * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. - * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. - * - * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: All - * @param[in] output Output tensor info. Data types supported: Same as @p input. - * @param[in] axis Concatenation axis. 
Supported underlying concatenation axis are 0, 1, 2 and 3. - * - * @return a status - */ - static Status validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - std::vector> _concat_kernels; - unsigned int _num_inputs; - unsigned int _axis; -}; -} // namespace experimental } // namespace arm_compute #endif /* ARM_COMPUTE_CLCONCATENATELAYER_H */ diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index 9c0020da66..af78a70abc 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -165,7 +165,7 @@ v20.11 Public major release - @ref NEGEMMLowpMatrixAReductionKernel - @ref NEGEMMLowpMatrixBReductionKernel - Removed padding from OpenCL kernels: - - @ref CLBatchConcatenateLayerKernel + - CLBatchConcatenateLayerKernel - @ref CLElementwiseOperationKernel - @ref CLBatchNormalizationLayerKernel - @ref CLPoolingLayerKernel @@ -184,17 +184,17 @@ v20.11 Public major release - @ref CLDepthwiseConvolutionLayer3x3NHWCKernel - @ref CLActivationLayerKernel - @ref CLWinogradFilterTransformKernel - - @ref CLWidthConcatenateLayerKernel - - @ref CLWidthConcatenate4TensorsKernel - - @ref CLWidthConcatenate2TensorsKernel + - CLWidthConcatenateLayerKernel + - CLWidthConcatenate4TensorsKernel + - CLWidthConcatenate2TensorsKernel - @ref CLLogits1DMaxShiftExpSumKernel - @ref CLLogits1DNormKernel - - @ref CLHeightConcatenateLayerKernel + - CLHeightConcatenateLayerKernel - @ref CLGEMMMatrixMultiplyKernel - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - - @ref CLDepthConcatenateLayerKernel + - CLDepthConcatenateLayerKernel - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - Removed OpenCL kernels / functions: - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel @@ -721,7 +721,7 @@ v19.08 Public major release - @ref CLNegLayer - @ref CLPReluLayer - @ref CLSinLayer - - @ref CLBatchConcatenateLayerKernel + - CLBatchConcatenateLayerKernel - @ref CLDepthToSpaceLayerKernel / @ref CLDepthToSpaceLayer - @ref CLGEMMLowpMatrixMultiplyNativeKernel - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel @@ -773,7 +773,7 @@ v19.05 Public major release - @ref CLFFTScaleKernel - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel - - @ref CLHeightConcatenateLayerKernel + - CLHeightConcatenateLayerKernel - @ref CLDirectDeconvolutionLayer - @ref CLFFT1D - @ref CLFFT2D @@ -1011,7 +1011,7 @@ v18.05 Public major release - @ref CLCopy / @ref CLCopyKernel - @ref CLLSTMLayer - @ref CLRNNLayer - - CLWidthConcatenateLayer / @ref CLWidthConcatenateLayerKernel + - CLWidthConcatenateLayer / CLWidthConcatenateLayerKernel - @ref CLWinogradFilterTransformKernel / @ref CLWinogradInputTransformKernel / @ref CLWinogradConvolutionLayer - @ref CLWinogradInputTransformKernel / @ref CLWinogradInputTransform - New Neon kernels / functions: @@ -1220,7 +1220,7 @@ v17.06 Public major release - User can specify his own scheduler by implementing the @ref IScheduler interface. 
- New OpenCL kernels / functions: - @ref CLBatchNormalizationLayerKernel / @ref CLBatchNormalizationLayer - - @ref CLDepthConcatenateLayerKernel / CLDepthConcatenateLayer + - CLDepthConcatenateLayerKernel / CLDepthConcatenateLayer - @ref CLHOGOrientationBinningKernel @ref CLHOGBlockNormalizationKernel, @ref CLHOGDetectorKernel / @ref CLHOGDescriptor @ref CLHOGDetector @ref CLHOGGradient @ref CLHOGMultiDetection - CLLocallyConnectedMatrixMultiplyKernel / CLLocallyConnectedLayer - @ref CLWeightsReshapeKernel / @ref CLConvolutionLayerReshapeWeights diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h index f23871d4db..11f1d2d7cf 100644 --- a/src/core/CL/CLKernels.h +++ b/src/core/CL/CLKernels.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,7 +29,6 @@ #include "src/core/CL/kernels/CLAccumulateKernel.h" #include "src/core/CL/kernels/CLActivationLayerKernel.h" #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" -#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h" #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" #include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" #include "src/core/CL/kernels/CLBitwiseKernel.h" @@ -48,7 +47,6 @@ #include "src/core/CL/kernels/CLCropKernel.h" #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" -#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h" #include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" @@ -92,7 +90,6 @@ #include "src/core/CL/kernels/CLHOGDescriptorKernel.h" #include "src/core/CL/kernels/CLHOGDetectorKernel.h" #include "src/core/CL/kernels/CLHarrisCornersKernel.h" -#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h" #include "src/core/CL/kernels/CLHistogramKernel.h" #include "src/core/CL/kernels/CLIm2ColKernel.h" #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" @@ -144,9 +141,6 @@ #include "src/core/CL/kernels/CLWarpAffineKernel.h" #include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" #include "src/core/CL/kernels/CLWeightsReshapeKernel.h" -#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" -#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" -#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h" #include "src/core/CL/kernels/CLWinogradInputTransformKernel.h" #include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h" diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp deleted file mode 100644 index ccd6a5a0fc..0000000000 --- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimZ) != output->dimension(Window::DimZ)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) + batch_offset > output->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, input, output); - - return Status{}; -} -} // namespace - -CLBatchConcatenateLayerKernel::CLBatchConcatenateLayerKernel() - : _batch_offset(0) -{ -} - -void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, batch_offset, output)); - - auto padding_info = get_padding_info({ input, output }); - - _batch_offset = batch_offset; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != 
output->quantization_info()) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - win.set(3, Window::Dimension(0, input->tensor_shape()[3], 1)); - ICLKernel::configure_internal(win); - - // Set output valid region - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - - // Set config_id for enabling LWS tuning - _config_id = "concatenate_"; - _config_id += support::cpp11::to_string(3); - _config_id += "_"; - _config_id += support::cpp11::to_string(batch_offset); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->dimension(3)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input, - unsigned int batch_offset, - const arm_compute::ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, batch_offset, output)); - return Status{}; -} - -void CLBatchConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_3D(); - - const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3]; - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters - _kernel.setArg(idx, offset_to_first_elements_in_bytes); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h deleted file mode 100644 index 54a89eb243..0000000000 --- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H -#define ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the batch concatenate kernel. - * The input tensor will be concatenated into the output tensor. - */ -class CLBatchConcatenateLayerKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLBatchConcatenateLayerKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLBatchConcatenateLayerKernel(const CLBatchConcatenateLayerKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLBatchConcatenateLayerKernel &operator=(const CLBatchConcatenateLayerKernel &) = delete; - /** Allow instances of this class to be moved */ - CLBatchConcatenateLayerKernel(CLBatchConcatenateLayerKernel &&) = default; - /** Allow instances of this class to be moved */ - CLBatchConcatenateLayerKernel &operator=(CLBatchConcatenateLayerKernel &&) = default; - /** Default destructor */ - ~CLBatchConcatenateLayerKernel() = default; - /** Initialise the kernel's inputs and output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data types supported: All. - * @param[in] batch_offset The offset on axis # 3. - * @param[in,out] output Output tensor. Data types supported: Same as @p input. - * - * @note: The output tensor's low two dimensions can't be smaller than the input one's. - * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLBatchConcatenateLayerKernel - * - * @param[in] input Input tensor info. Data types supported: All. - * @param[in] batch_offset The offset on axis # 3. - * @param[in] output Output tensor info. Data types supported: Same as @p input. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - unsigned int _batch_offset; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H */ diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp deleted file mode 100644 index eb5bfc2d86..0000000000 --- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output); - - return Status{}; -} -} // namespace - -CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel() - : _depth_offset(0) -{ -} - -void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, depth_offset, output)); - - auto padding_info = get_padding_info({ input, output }); - - _depth_offset = depth_offset; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1)); - ICLKernel::configure_internal(win); - - // Set output valid region - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input, - 
unsigned int depth_offset, - const arm_compute::ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, depth_offset, output)); - return Status{}; -} - -void CLDepthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_3D(); - - const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2]; - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters - _kernel.setArg(idx, offset_to_first_elements_in_bytes); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h deleted file mode 100644 index 6c73bd4bf4..0000000000 --- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H -#define ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -/** Interface for the depth concatenate kernel. - * The input tensor will be concatenated into the output tensor. 
- */ -class CLDepthConcatenateLayerKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDepthConcatenateLayerKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthConcatenateLayerKernel(const CLDepthConcatenateLayerKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthConcatenateLayerKernel &operator=(const CLDepthConcatenateLayerKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDepthConcatenateLayerKernel(CLDepthConcatenateLayerKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDepthConcatenateLayerKernel &operator=(CLDepthConcatenateLayerKernel &&) = default; - /** Default destructor */ - ~CLDepthConcatenateLayerKernel() = default; - /** Initialise the kernel's inputs and output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] depth_offset The offset on the Z axis. - * @param[in,out] output Output tensor. Data types supported: Same as @p input. - * - * @note: The output tensor's low two dimensions can't be smaller than the input one's. - * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel - * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] depth_offset The offset on the Z axis. - * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - unsigned int _depth_offset; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H */ diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp deleted file mode 100644 index 8aa7366d50..0000000000 --- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) + height_offset > output->dimension(Window::DimY)); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != output->dimension(0)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel() - : _height_offset(0) -{ -} - -Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output)); - return Status{}; -} - -void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, height_offset, output)); - - auto padding_info = get_padding_info({ input, output }); - - _height_offset = height_offset; - - // Add build options - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->dimension(0)); - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); - - if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - 
build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options()); - // Configure kernel window - - // The window needs to be based on input as we copy all the heights of input - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - // Set output valid region - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void CLHeightConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, window); - add_4D_tensor_argument(idx, dst, window); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h deleted file mode 100644 index f4cb627052..0000000000 --- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H -#define ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -/** Interface for the height concatenate kernel. - * The input tensor will be concatenated into the output tensor. 
- */ -class CLHeightConcatenateLayerKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLHeightConcatenateLayerKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHeightConcatenateLayerKernel(const CLHeightConcatenateLayerKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHeightConcatenateLayerKernel &operator=(const CLHeightConcatenateLayerKernel &) = delete; - /** Allow instances of this class to be moved */ - CLHeightConcatenateLayerKernel(CLHeightConcatenateLayerKernel &&) = default; - /** Allow instances of this class to be moved */ - CLHeightConcatenateLayerKernel &operator=(CLHeightConcatenateLayerKernel &&) = default; - /** Default destructor */ - ~CLHeightConcatenateLayerKernel() = default; - /** Initialise the kernel's inputs and output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data types supported: All. - * @param[in] height_offset The starting offset on the Y axis for the output tensor. - * @param[out] output Output tensor. Data types supported: Same as @p input. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLHeightConcatenateLayerKernel - * - * @param[in] input Input tensor info. Data types supported: All. - * @param[in] height_offset The starting offset on the Y axis for the output tensor. - * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - unsigned int _height_offset; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */ diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp deleted file mode 100644 index d6697ba46b..0000000000 --- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/tensor_info.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1); - ARM_COMPUTE_RETURN_ERROR_ON(input1->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) > output->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output)); - return Status{}; -} - -void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output)); - - auto padding_info = get_padding_info({ input1, input2, output }); - - const unsigned int min_dimension = std::min(input1->dimension(0), input2->dimension(0)); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); - const unsigned int vec_size_leftover = output->dimension(0) % num_elems_processed_per_iteration; - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2))); - build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0))); - build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->dimension(0))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - - // If input have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2); - if(is_data_type_quantized_asymmetric(input1->data_type()) && have_different_qinfo) - { - const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = 
input2->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - // Set output valid region - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = "concatenate_width_x2_"; - _config_id += lower_string(string_from_data_type(input1->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input2->dimension(1)); -} - -void CLWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_4D(); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src0, slice); - add_4D_tensor_argument(idx, src1, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h deleted file mode 100644 index 2af89e12eb..0000000000 --- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H -#define ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -/** Interface for the width concatenate kernel of 2 tensors. - * The input1 and input2 tensors will be concatenated into the output tensor. - */ -class CLWidthConcatenate2TensorsKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLWidthConcatenate2TensorsKernel() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLWidthConcatenate2TensorsKernel &operator=(const CLWidthConcatenate2TensorsKernel &) = delete; - /** Allow instances of this class to be moved */ - CLWidthConcatenate2TensorsKernel(CLWidthConcatenate2TensorsKernel &&) = default; - /** Allow instances of this class to be moved */ - CLWidthConcatenate2TensorsKernel &operator=(CLWidthConcatenate2TensorsKernel &&) = default; - /** Default destructor */ - ~CLWidthConcatenate2TensorsKernel() = default; - /** Initialise the kernel's input1s and output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input1 First input tensor. Data types supported: All. - * @param[in] input2 Second input tensor. Data types supported: same as @p input1 - * @param[out] output Output tensor. Data types supported: Same as @p input1. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel - * - * @param[in] input1 First tensor info. Data types supported: All. - * @param[in] input2 Second tensor info. Data types supported: same as @p input1 - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H */ diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp deleted file mode 100644 index 7ecdd30224..0000000000 --- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/tensor_info.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1); - ARM_COMPUTE_RETURN_ERROR_ON(input1->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(input3->dimension(i) != output->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(input4->dimension(i) != output->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel() -{ -} - -Status CLWidthConcatenate4TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, input3, input4, output)); - return Status{}; -} - -void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *input1, ITensorInfo *input2, - ITensorInfo *input3, ITensorInfo *input4, - ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, input3, input4, output)); - - auto padding_info = get_padding_info({ input1, 
input2, input3, input4, output }); - const unsigned int min_dimension = std::min(std::min(input1->dimension(0), input2->dimension(0)), std::min(input3->dimension(0), input4->dimension(0))); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); - const unsigned int vec_size_leftover = output->dimension(0) % num_elems_processed_per_iteration; - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2))); - build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0))); - build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->dimension(0))); - build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->dimension(0))); - build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(input4->dimension(0))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) + input2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) + input2->dimension(0) + input3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - - // If input have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2, input3, input4); - if(is_data_type_quantized_asymmetric(input1->data_type()) && have_different_qinfo) - { - const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform(); - const UniformQuantizationInfo iq3_info = input3->quantization_info().uniform(); - const UniformQuantizationInfo iq4_info = input4->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset)); - build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale)); - build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset)); - build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create 
kernel - _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - // Set output valid region - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = "concatenate_width_x4_"; - _config_id += lower_string(string_from_data_type(input1->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input2->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input3->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input3->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input4->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input4->dimension(1)); -} - -void CLWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); - const auto src3 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_4D(); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src0, slice); - add_4D_tensor_argument(idx, src1, slice); - add_4D_tensor_argument(idx, src2, slice); - add_4D_tensor_argument(idx, src3, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h deleted file mode 100644 index 0caf87114d..0000000000 --- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H -#define ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -/** Interface for the width concatenate kernel of 4 tensors. - * All input tensors will be concatenated into the output tensor. - */ -class CLWidthConcatenate4TensorsKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLWidthConcatenate4TensorsKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLWidthConcatenate4TensorsKernel(const CLWidthConcatenate4TensorsKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLWidthConcatenate4TensorsKernel &operator=(const CLWidthConcatenate4TensorsKernel &) = delete; - /** Allow instances of this class to be moved */ - CLWidthConcatenate4TensorsKernel(CLWidthConcatenate4TensorsKernel &&) = default; - /** Allow instances of this class to be moved */ - CLWidthConcatenate4TensorsKernel &operator=(CLWidthConcatenate4TensorsKernel &&) = default; - /** Default destructor */ - ~CLWidthConcatenate4TensorsKernel() = default; - /** Initialise the kernel's input1s and output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input1 First input tensor. Data types supported: All. - * @param[in] input2 Second input tensor. Data types supported: same as @p input1 - * @param[in] input3 Third input tensor. Data types supported: same as @p input1 - * @param[in] input4 Fourth input tensor. Data types supported: same as @p input1 - * @param[out] output Output tensor. Data types supported: Same as @p input1. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel - * - * @param[in] input1 First tensor info. Data types supported: All. - * @param[in] input2 Second tensor info. Data types supported: same as @p input1 - * @param[in] input3 Third tensor info. Data types supported: same as @p input1 - * @param[in] input4 Fourth tensor info. Data types supported: same as @p input1 - * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H */ diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp deleted file mode 100644 index 30d0a481bd..0000000000 --- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) + width_offset > output->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -CLWidthConcatenateLayerKernel::CLWidthConcatenateLayerKernel() -{ -} - -Status CLWidthConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, width_offset, output)); - return Status{}; -} - -void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, width_offset, output)); - - auto padding_info = get_padding_info({ input, output }); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, input->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2))); - - if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) - { - const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options()); - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - // Set output valid region - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - - 
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void CLWidthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, window); - add_4D_tensor_argument(idx, dst, window); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h deleted file mode 100644 index 09c3f4455d..0000000000 --- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H -#define ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -/** Interface for the width concatenate kernel. - * The input tensor will be concatenated into the output tensor. - */ -class CLWidthConcatenateLayerKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLWidthConcatenateLayerKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLWidthConcatenateLayerKernel(const CLWidthConcatenateLayerKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLWidthConcatenateLayerKernel &operator=(const CLWidthConcatenateLayerKernel &) = delete; - /** Allow instances of this class to be moved */ - CLWidthConcatenateLayerKernel(CLWidthConcatenateLayerKernel &&) = default; - /** Allow instances of this class to be moved */ - CLWidthConcatenateLayerKernel &operator=(CLWidthConcatenateLayerKernel &&) = default; - /** Default destructor */ - ~CLWidthConcatenateLayerKernel() = default; - /** Initialise the kernel's inputs and output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data types supported: All. - * @param[in] width_offset The offset on the X axis. 
- * @param[in,out] output Output tensor. Data types supported: Same as @p input. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenateLayerKernel - * - * @param[in] input Input tensor info. Data types supported: All. - * @param[in] width_offset The offset on the X axis. - * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H */ diff --git a/src/core/gpu/cl/ClCompileContext.h b/src/core/gpu/cl/ClCompileContext.h new file mode 100644 index 0000000000..e69cc0200f --- /dev/null +++ b/src/core/gpu/cl/ClCompileContext.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_COMPILE_CONTEXT_H +#define ARM_COMPUTE_CL_COMPILE_CONTEXT_H + +#include "arm_compute/core/CL/CLCompileContext.h" + +namespace arm_compute +{ +namespace opencl +{ +using ClCompileContext = arm_compute::CLCompileContext; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_COMPILE_CONTEXT_H */ diff --git a/src/core/gpu/cl/IClKernel.h b/src/core/gpu/cl/IClKernel.h new file mode 100644 index 0000000000..52ea3c9183 --- /dev/null +++ b/src/core/gpu/cl/IClKernel.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_ICL_KERNEL_H +#define ARM_COMPUTE_ICL_KERNEL_H + +#include "arm_compute/core/ITensorInfo.h" +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +using IClKernel = arm_compute::ICLKernel; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_ICL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp new file mode 100644 index 0000000000..c16ff1f028 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst); + + return Status{}; +} +} // namespace + +ClBatchConcatenateKernel::ClBatchConcatenateKernel() + : _batch_offset(0) +{ +} + +void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + _batch_offset = batch_offset; + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); + + // Configure kernel window + auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + win.set(3, Window::Dimension(0, src->tensor_shape()[3], 1)); + ICLKernel::configure_internal(win); + + // Set dst valid region + dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape())); + + // Set config_id for enabling LWS tuning + _config_id = "concatenate_"; + _config_id += support::cpp11::to_string(3); + _config_id += "_"; + _config_id += support::cpp11::to_string(batch_offset); + _config_id 
+= "_"; + _config_id += support::cpp11::to_string(src->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(3)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClBatchConcatenateKernel::validate(const arm_compute::ITensorInfo *src, + unsigned int batch_offset, + const arm_compute::ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst)); + return Status{}; +} + +void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + Window slice = window.first_slice_window_3D(); + + const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3]; + + unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters + _kernel.setArg(idx, offset_to_first_elements_in_bytes); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace opencl +} // namespace kernels +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h new file mode 100644 index 0000000000..378a08aa4f --- /dev/null +++ b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H +#define ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the batch concatenate kernel. + * The src tensor will be concatenated into the destination tensor. 
+ */ +class ClBatchConcatenateKernel : public IClKernel +{ +public: + /** Default constructor */ + ClBatchConcatenateKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClBatchConcatenateKernel); + /** Initialise the kernel's source and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data types supported: All. + * @param[in] batch_offset The offset on axis # 3. + * @param[in,out] dst Destination tensor. Data types supported: Same as @p src. + * + * @note: The dst tensor's low two dimensions can't be smaller than the src one's. + * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. + * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref ClBatchConcatenateKernel + * + * @param[in] src Input tensor info. Data types supported: All. + * @param[in] batch_offset The offset on axis # 3. + * @param[in] dst Destination tensor info. Data types supported: Same as @p src. + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; + +private: + unsigned int _batch_offset; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp new file mode 100644 index 0000000000..e8893d76d2 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
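For readers new to the state-less kernel API introduced by this patch, a minimal sketch of how ClBatchConcatenateKernel is meant to be driven: configure() only receives ITensorInfo metadata, and the actual tensors are bound at run time through an ITensorPack. The shapes, the helper function name and the use of CLKernelLibrary::get().get_compile_context() / CLScheduler::get().queue() are illustrative assumptions, not part of this patch.

    // Illustrative sketch only: drive the state-less batch concatenate kernel directly.
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"

    using namespace arm_compute;

    void example_batch_concat()
    {
        CLScheduler::get().default_init();

        // Copy an 8x8x3x2 source into batches [2,3] of an 8x8x3x4 destination.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 2U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 4U), 1, DataType::F32));
        src.allocator()->allocate();
        dst.allocator()->allocate();

        // configure() only sees metadata (ITensorInfo), no tensor memory.
        opencl::kernels::ClBatchConcatenateKernel kernel;
        kernel.configure(CLKernelLibrary::get().get_compile_context(), src.info(), 2 /* batch_offset */, dst.info());

        // Tensors are bound at run time through an ITensorPack, which is what
        // makes the kernel state-less and reusable across different tensors.
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        kernel.run_op(pack, kernel.window(), CLScheduler::get().queue());
    }
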
+ */ +#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst); + + return Status{}; +} +} // namespace + +ClDepthConcatenateKernel::ClDepthConcatenateKernel() + : _depth_offset(0) +{ +} + +void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + _depth_offset = depth_offset; + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); + + // Configure kernel window + auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, src->tensor_shape().z(), 1)); + ICLKernel::configure_internal(win); + + // Set dst valid region + dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClDepthConcatenateKernel::validate(const arm_compute::ITensorInfo *src, + unsigned int depth_offset, + const arm_compute::ITensorInfo *dst) +{ + 
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst)); + return Status{}; +} + +void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + Window slice = window.first_slice_window_3D(); + + const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2]; + + unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters + _kernel.setArg(idx, offset_to_first_elements_in_bytes); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h new file mode 100644 index 0000000000..144d7d48f2 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H +#define ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the depth concatenate kernel. + * The src tensor will be concatenated into the dst tensor. + */ +class ClDepthConcatenateKernel : public ICLKernel +{ +public: + /** Default constructor */ + ClDepthConcatenateKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDepthConcatenateKernel); + /** Initialise the kernel's source and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] depth_offset The offset on the Z axis. + * @param[in,out] dst Destination tensor. Data types supported: Same as @p src. 
+ * + * @note: The dst tensor's low two dimensions can't be smaller than the src one's. + * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. + * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref ClDepthConcatenateKernel + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[in] depth_offset The offset on the Z axis. + * @param[in] dst Destination tensor info. Data types supported: Same as @p src. + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; + +private: + unsigned int _depth_offset; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp new file mode 100644 index 0000000000..83e976e10f --- /dev/null +++ b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
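The static validate() entry points mirror the argument checks performed by configure(), so a caller can reject an unsupported configuration before any kernel is built. A small sketch for the depth kernel; the shapes and the function name are made up for illustration.

    #include <iostream>

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h"

    using namespace arm_compute;

    void example_validate_depth_concat()
    {
        // Illustrative shapes: a 16x16x4 slice written at depth offset 4 of a 16x16x12 destination.
        const TensorInfo   src_info(TensorShape(16U, 16U, 4U), 1, DataType::F32);
        const TensorInfo   dst_info(TensorShape(16U, 16U, 12U), 1, DataType::F32);
        const unsigned int depth_offset = 4;

        // validate() runs the same checks as configure(), without building anything.
        const Status status = opencl::kernels::ClDepthConcatenateKernel::validate(&src_info, depth_offset, &dst_info);
        if(status.error_code() != ErrorCode::OK)
        {
            std::cerr << status.error_description() << std::endl;
        }
    }
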
+ */ +#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); + + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0)); + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); + } + ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4); + + return Status{}; +} +} // namespace + +ClHeightConcatenateKernel::ClHeightConcatenateKernel() + : _height_offset(0) +{ +} + +Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst)); + return Status{}; +} + +void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + _height_offset = height_offset; + + // Add build options + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + + if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options()); + // Configure kernel window + + // The window needs to be based on src as we copy all the heights of src + Window win = calculate_max_window(*src, 
Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); + + // Set dst valid region + dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, window); + add_4D_tensor_argument(idx, dst, window); + enqueue(queue, *this, window, lws_hint()); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h new file mode 100644 index 0000000000..88cd4c4d17 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H +#define ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the height concatenate kernel. + * The source tensor will be concatenated into the destination tensor. + */ +class ClHeightConcatenateKernel : public IClKernel +{ +public: + /** Default constructor */ + ClHeightConcatenateKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClHeightConcatenateKernel); + /** Initialise the kernel's source and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data types supported: All. + * @param[in] height_offset The starting offset on the Y axis for the dst tensor. + * @param[out] dst Destination tensor. Data types supported: same as @p src. 
+ * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref ClHeightConcatenateKernel + * + * @param[in] src Source tensor info. Data types supported: All. + * @param[in] height_offset The starting offset on the Y axis for the dst tensor. + * @param[in] dst Destination tensor info. Data types supported: same as @p src. + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; + +private: + unsigned int _height_offset; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp new file mode 100644 index 0000000000..6a2ab3b50f --- /dev/null +++ b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/tensor_info.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0)); + + for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); + ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); + } + ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4); + + return Status{}; +} +} // namespace + +Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst)); + return Status{}; +} + +void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst)); + + auto padding_info = get_padding_info({ src1, src2, dst }); + + const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0)); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); + const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0))); + build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0))); + build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + + // If input have different quantization info set quantization parameters needed for the re-quantization process + const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2); + if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + { + const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); + const UniformQuantizationInfo 
oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); + build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); + build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); + + // Set dst valid region + dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape())); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + + // Set config_id for enabling LWS tuning + _config_id = "concatenate_width_x2_"; + _config_id += lower_string(string_from_data_type(src1->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(1)); +} + +void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_4D(); + + const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, src0, slice); + add_4D_tensor_argument(idx, src1, slice); + add_4D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, window, lws_hint()); + } + while(window.slide_window_slice_4D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h new file mode 100644 index 0000000000..92715008cf --- /dev/null +++ b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H +#define ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the width concatenate kernel of 2 tensors. + * The src1 and src2 tensors will be concatenated into the dst tensor. + */ +class ClWidthConcatenate2TensorsKernel : public IClKernel +{ +public: + /** Default constructor */ + ClWidthConcatenate2TensorsKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel); + /** Initialise the kernel's sources and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 First source tensor. Data types supported: All. + * @param[in] src2 Second source tensor. Data types supported: same as @p src1 + * @param[out] dst Destination tensor. Data types supported: Same as @p src1. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenate2TensorsKernel + * + * @param[in] src1 First tensor info. Data types supported: All. + * @param[in] src2 Second tensor info. Data types supported: same as @p src1 + * @param[in] dst Destination tensor info. Data types supported: Same as @p src1. + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_2TENSORS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp new file mode 100644 index 0000000000..4b49652a73 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/tensor_info.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0)); + + for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); + ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); + ARM_COMPUTE_RETURN_ERROR_ON(src3->dimension(i) != dst->dimension(i)); + ARM_COMPUTE_RETURN_ERROR_ON(src4->dimension(i) != dst->dimension(i)); + } + ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4); + + return Status{}; +} +} // namespace + +ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel() +{ +} + +Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst)); + return Status{}; +} + +void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, ITensorInfo *src2, + ITensorInfo *src3, ITensorInfo *src4, + ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst)); + + auto padding_info = get_padding_info({ src1, src2, src3, src4, dst }); + const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); + const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); + 
build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0))); + build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0))); + build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(src3->dimension(0))); + build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(src4->dimension(0))); + build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + + // If soources have different quantization info set quantization parameters needed for the re-quantization process + const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); + if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + { + const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); + const UniformQuantizationInfo iq3_info = src3->quantization_info().uniform(); + const UniformQuantizationInfo iq4_info = src4->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); + build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); + build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); + build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset)); + build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale)); + build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset)); + build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); + + // Set dst valid region + dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape())); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + + // Set config_id for enabling LWS tuning + _config_id = "concatenate_width_x4_"; + _config_id += lower_string(string_from_data_type(src1->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(1)); + 
_config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src3->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src3->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src4->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src4->dimension(1)); +} + +void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); + const auto src3 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + Window slice = window.first_slice_window_4D(); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, src0, slice); + add_4D_tensor_argument(idx, src1, slice); + add_4D_tensor_argument(idx, src2, slice); + add_4D_tensor_argument(idx, src3, slice); + add_4D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, window, lws_hint()); + } + while(window.slide_window_slice_4D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h new file mode 100644 index 0000000000..06d6c0399a --- /dev/null +++ b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H +#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the width concatenate kernel of 4 tensors. + * All source tensors will be concatenated into the destination tensor. + */ +class ClWidthConcatenate4TensorsKernel : public IClKernel +{ +public: + /** Default constructor */ + ClWidthConcatenate4TensorsKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate4TensorsKernel); + /** Initialise the kernel's sources and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 First source tensor. Data types supported: All. + * @param[in] src2 Second source tensor. Data types supported: same as @p src1 + * @param[in] src3 Third source tensor. Data types supported: same as @p src1 + * @param[in] src4 Fourth source tensor. Data types supported: same as @p src1 + * @param[out] dst Destination tensor. Data types supported: same as @p src1. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenate4TensorsKernel + * + * @param[in] src1 First tensor info. Data types supported: All. + * @param[in] src2 Second tensor info. Data types supported: same as @p src1 + * @param[in] src3 Third tensor info. Data types supported: same as @p src1 + * @param[in] src4 Fourth tensor info. Data types supported: same as @p src1 + * @param[in] dst Destination tensor info. Data types supported: same as @p src1. + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp new file mode 100644 index 0000000000..8cbbc27444 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); + + for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); + } + ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4); + + return Status{}; +} +} // namespace + +ClWidthConcatenateKernel::ClWidthConcatenateKernel() +{ +} + +Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst)); + return Status{}; +} + +void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0)); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2))); + + if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options()); + // Configure kernel window + Window win = calculate_max_window(*src, 
Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); + + // Set dst valid region + dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, window); + add_4D_tensor_argument(idx, dst, window); + enqueue(queue, *this, window, lws_hint()); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h new file mode 100644 index 0000000000..3bffe52700 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H +#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the width concatenate kernel. + * The source tensor will be concatenated into the destination tensor. + */ +class ClWidthConcatenateKernel : public IClKernel +{ +public: + /** Default constructor */ + ClWidthConcatenateKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenateKernel); + /** Initialise the kernel's source and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data types supported: All. + * @param[in] width_offset The offset on the X axis. + * @param[in,out] dst Destination tensor. Data types supported: same as @p src. 
+ * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenateKernel + * + * @param[in] src Source tensor info. Data types supported: All. + * @param[in] width_offset The offset on the X axis. + * @param[in] dst Destination tensor info. Data types supported: same as @p src. + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H */ diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 0c473a79c8..ea96e45bf8 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,242 +23,19 @@ */ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h" -#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h" -#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" -#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" -#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h" - #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h" -#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/CL/ICLKernel.h" +#include "src/runtime/gpu/cl/operators/ClConcatenate.h" namespace arm_compute { -namespace experimental -{ -CLConcatenation::CLConcatenation() - : _concat_kernels(), - _num_inputs(0), - _axis(Window::DimX) -{ -} - -void CLConcatenation::configure(const CLCompileContext &compile_context, const std::vector &inputs_vector, ITensorInfo *output, size_t axis) -{ - ARM_COMPUTE_ERROR_ON(output == nullptr); - _axis = axis; - _num_inputs = inputs_vector.size(); - - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis); - std::vector const_inputs_vector(inputs_vector.size()); - std::transform(inputs_vector.begin(), inputs_vector.end(), const_inputs_vector.begin(), [](ITensorInfo * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t; - }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, output_shape, 1, inputs_vector[0]->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(const_inputs_vector, output, axis)); - - unsigned int offset = 0; - switch(_axis) - { - case Window::DimX: - { - switch(_num_inputs) - { - case 2: - { - // Configure WidthConcatenate2Tensors kernel - auto kernel = std::make_unique(); - kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case 4: - { - // Configure WidthConcatenate4Tensors kernel - auto kernel = std::make_unique(); 
- kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - default: - { - // Configure generic case WidthConcatenate kernels - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = std::make_unique(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - } - break; - } - case Window::DimY: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = std::make_unique(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - case Window::DimZ: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = std::make_unique(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - case 3: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = std::make_unique(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } -} - -Status CLConcatenation::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) -{ - ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr); - const unsigned int num_inputs = inputs_vector.size(); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); - - unsigned int offset = 0; - switch(axis) - { - case Window::DimX: - { - switch(num_inputs) - { - case 2: - // Validate WidthConcatenate2Tensors kernels if there are 2 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], output)); - break; - case 4: - // Validate WidthConcatenate4Tensors kernels if there are 4 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], output)); - break; - default: - // Validate generic case of WidthConcatenate kernel - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - break; - } - case Window::DimY: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLHeightConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - case Window::DimZ: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - case 3: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLBatchConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - default: - 
ARM_COMPUTE_ERROR("Axis not supported"); - } - - if(output->total_size() != 0) - { - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis); - ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); - } - - return Status{}; -} - -void CLConcatenation::run(ITensorPack &tensors) -{ - if(tensors.empty()) - { - ARM_COMPUTE_ERROR("No inputs provided"); - } - - if(static_cast(tensors.size()) - 1 != static_cast(_num_inputs)) - { - ARM_COMPUTE_ERROR("Configured with different number of inputs"); - } - - if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) - { - ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); - CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); - } - else - { - int i = 0; - for(auto &k : _concat_kernels) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); - pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); - CLScheduler::get().enqueue_op(*k, pack, true); - ++i; - } - } -} -} // namespace experimental - struct CLConcatenateLayer::Impl { - std::vector srcs{}; - ICLTensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr op{ nullptr }; + std::vector srcs{}; + ICLTensor *dst{ nullptr }; + unsigned int num_inputs{ 0 }; + unsigned int axis{ 0 }; + std::unique_ptr op{ nullptr }; }; CLConcatenateLayer::CLConcatenateLayer() @@ -285,7 +62,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->dst = output; _impl->axis = axis; _impl->num_inputs = inputs_vector.size(); - _impl->op = std::make_unique(); + _impl->op = std::make_unique(); std::vector inputs_vector_info; for(unsigned int i = 0; i < inputs_vector.size(); ++i) @@ -298,7 +75,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) { - return experimental::CLConcatenation::validate(inputs_vector, output, axis); + return opencl::ClConcatenate::validate(inputs_vector, output, axis); } void CLConcatenateLayer::run() diff --git a/src/runtime/gpu/cl/IClOperator.h b/src/runtime/gpu/cl/IClOperator.h new file mode 100644 index 0000000000..049bf05dc1 --- /dev/null +++ b/src/runtime/gpu/cl/IClOperator.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_ICL_OPERATOR_H +#define ARM_COMPUTE_ICL_OPERATOR_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/CL/ICLOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +using IClOperator = experimental::ICLOperator; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_ICL_OPERATOR_H */ diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.cpp b/src/runtime/gpu/cl/operators/ClConcatenate.cpp new file mode 100644 index 0000000000..4385fcfaed --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClConcatenate.cpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/gpu/cl/operators/ClConcatenate.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h" +#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h" +#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h" +#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h" +#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" +#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "src/core/helpers/AutoConfiguration.h" + +namespace arm_compute +{ +namespace opencl +{ +ClConcatenate::ClConcatenate() + : _concat_kernels(), + _num_inputs(0), + _axis(Window::DimX) +{ +} + +void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector &src_vector, ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_ERROR_ON(dst == nullptr); + _axis = axis; + _num_inputs = src_vector.size(); + + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); + std::vector const_src_vector(src_vector.size()); + std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t; + }); + + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type()); + ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis)); + + unsigned int offset = 0; + switch(_axis) + { + case Window::DimX: + { + switch(_num_inputs) + { + case 2: + { + // Configure WidthConcatenate2Tensors kernel + auto kernel = std::make_unique(); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case 4: + { + // Configure WidthConcatenate4Tensors kernel + auto kernel = std::make_unique(); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + default: + { + // Configure generic case WidthConcatenate kernels + for(unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + } + break; + } + case Window::DimY: + { + for(unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + case Window::DimZ: + { + for(unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + case 3: + { + for(unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + default: + ARM_COMPUTE_ERROR("Axis not 
supported"); + } +} + +Status ClConcatenate::validate(const std::vector &src_vector, const ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr); + const unsigned int num_inputs = src_vector.size(); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); + + unsigned int offset = 0; + switch(axis) + { + case Window::DimX: + { + switch(num_inputs) + { + case 2: + // Validate WidthConcatenate2Tensors kernels if there are 2 inputs + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); + break; + case 4: + // Validate WidthConcatenate4Tensors kernels if there are 4 inputs + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); + break; + default: + // Validate generic case of WidthConcatenate kernel + for(const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + break; + } + case Window::DimY: + { + for(const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + case Window::DimZ: + { + for(const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + case 3: + { + for(const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } + + if(dst->total_size() != 0) + { + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); + ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); + } + + return Status{}; +} + +void ClConcatenate::run(ITensorPack &tensors) +{ + if(tensors.empty()) + { + ARM_COMPUTE_ERROR("No inputs provided"); + } + + if(static_cast(tensors.size()) - 1 != static_cast(_num_inputs)) + { + ARM_COMPUTE_ERROR("Configured with different number of inputs"); + } + + if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + { + ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); + CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); + } + else + { + int i = 0; + for(auto &k : _concat_kernels) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); + pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); + CLScheduler::get().enqueue_op(*k, pack, true); + ++i; + } + } +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.h b/src/runtime/gpu/cl/operators/ClConcatenate.h new file mode 100644 index 0000000000..112e2ac6b7 --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClConcatenate.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLCONCATENATE_H +#define ARM_COMPUTE_CLCONCATENATE_H + +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" +#include "src/runtime/gpu/cl/IClOperator.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels: + * + * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0). + * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1). + * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2). + * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3). + */ +class ClConcatenate : public IClOperator +{ +public: + /** Default constructor */ + ClConcatenate(); + /** Initialise the kernel's inputs vector and dst. + * + * @note Input and dst tensor dimensions preconditions defer depending on the concatenation axis. + * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel, + * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel. + * + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] src_vector The vectors containing all the tensors to concatenate. Data types supported: All + * @param[out] dst Destination tensor. Data types supported: same as @p src_vector. + * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. + */ + void configure(const ClCompileContext &compile_context, const std::vector &src_vector, ITensorInfo *dst, size_t axis); + /** Static function to check if given info will lead to a valid configuration of @ref ClConcatenate + * + * @note Input and dst tensor dimensions preconditions defer depending on the concatenation axis. + * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel, + * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel. + * + * @param[in] src_vector The vectors containing all the tensors info to concatenate. Data types supported: All + * @param[in] dst Destination tensor info. Data types supported: same as @p src_vector. + * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. 
+     *
+     * @return a status
+     */
+    static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::vector<std::unique_ptr<IClKernel>> _concat_kernels;
+    unsigned int _num_inputs;
+    unsigned int _axis;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONCATENATE_H */
-- cgit v1.2.1
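
For readers migrating from the removed experimental::CLConcatenation interface, the minimal sketch below illustrates how the new state-less opencl::ClConcatenate operator introduced above is expected to be driven: configuration works on ITensorInfo metadata only, and the actual tensors are bound at run time through an ITensorPack, mirroring what CLConcatenateLayer::configure()/run() now do internally. The helper function name, the tensor objects and the explicit CLKernelLibrary/CLScheduler setup are illustrative assumptions rather than part of this patch; application code would normally keep using the public CLConcatenateLayer facade.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClConcatenate.h" // internal header, not part of the public API

using namespace arm_compute;

// Illustrative sketch: concatenate two already-allocated CL tensors along the X (width) axis.
// Assumes CLScheduler::get().default_init() has been called and that dst has been
// initialised and allocated with the concatenated shape.
void concatenate_along_x(CLTensor &src0, CLTensor &src1, CLTensor &dst)
{
    // 1) Configure on tensor metadata only: the operator keeps no tensor pointers.
    opencl::ClConcatenate concat;
    concat.configure(CLKernelLibrary::get().get_compile_context(),
                     { src0.info(), src1.info() }, dst.info(), /* axis */ 0);

    // 2) Bind the actual tensors at run time through an ITensorPack, using the
    //    same ACL_SRC_VEC / ACL_DST slots that ClConcatenate::run() reads back.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_VEC, &src0);
    pack.add_tensor(TensorType::ACL_SRC_VEC + 1, &src1);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    concat.run(pack);

    // Wait for the enqueued kernels before reading dst back.
    CLScheduler::get().sync();
}

The point of removing tensor state from the kernels is visible in this split: once configured, the same operator can be re-run with different tensor bindings, since the tensors are only supplied at enqueue time through the pack.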