author     Michele Di Giorgio <michele.digiorgio@arm.com>    2021-01-18 21:15:59 +0000
committer  Georgios Pinitas <georgios.pinitas@arm.com>       2021-01-20 16:28:27 +0000
commit     7d61ff041826782d14e67b7f5b7a2864905ff38b (patch)
tree       2e69c8a5fdabc6717b0691acdbbe7374d856902f
parent     da6a6eb3bc06ce8869ae3290853970d4c0ce412e (diff)
download   ComputeLibrary-7d61ff041826782d14e67b7f5b7a2864905ff38b.tar.gz
Make all CL Concatenate kernels and functions state-less
Resolves COMPMID-3995

Change-Id: I84172bed20924f1d9ae3b4d14d7b321e9494296e
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4887
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
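State-less here means the operator no longer owns any tensors: configure() and validate() work purely on ITensorInfo metadata, and the concrete tensors are handed over at run time in an ITensorPack. The following is a minimal sketch of how the new opencl::ClConcatenate operator could be driven; the signatures are assumed to match the removed experimental::CLConcatenation interface visible further down in this patch, and the pack slot ids are illustrative rather than taken from the patch.

```cpp
// Sketch only: assumes ClConcatenate keeps the configure/validate/run interface
// of the removed experimental::CLConcatenation, and that source tensors are
// packed under ACL_SRC_VEC + index as in other concatenation functions.
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "src/runtime/gpu/cl/operators/ClConcatenate.h"

using namespace arm_compute;

void concat_width(const CLCompileContext &ctx, ICLTensor *in0, ICLTensor *in1, ICLTensor *out)
{
    // Configure on metadata only: no tensor memory is captured by the operator.
    opencl::ClConcatenate concat;
    const std::vector<ITensorInfo *> infos{ in0->info(), in1->info() };
    concat.configure(ctx, infos, out->info(), 0U /* width axis */);

    // Bind the real tensors at run time; the configured operator itself stays
    // free of tensor state and can be reused across workloads.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_VEC + 0, in0);
    pack.add_tensor(TensorType::ACL_SRC_VEC + 1, in1);
    pack.add_tensor(TensorType::ACL_DST, out);
    concat.run(pack);
}
```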
-rw-r--r--  Android.bp | 13
-rw-r--r--  SConscript | 4
-rw-r--r--  arm_compute/runtime/CL/functions/CLConcatenateLayer.h | 69
-rw-r--r--  docs/00_introduction.dox | 20
-rw-r--r--  src/core/CL/CLKernels.h | 8
-rw-r--r--  src/core/CL/kernels/CLBatchConcatenateLayerKernel.h | 82
-rw-r--r--  src/core/CL/kernels/CLDepthConcatenateLayerKernel.h | 80
-rw-r--r--  src/core/CL/kernels/CLHeightConcatenateLayerKernel.h | 77
-rw-r--r--  src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h | 73
-rw-r--r--  src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h | 77
-rw-r--r--  src/core/CL/kernels/CLWidthConcatenateLayerKernel.h | 74
-rw-r--r--  src/core/gpu/cl/ClCompileContext.h | 36
-rw-r--r--  src/core/gpu/cl/IClKernel.h | 37
-rw-r--r--  src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp (renamed from src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp) | 80
-rw-r--r--  src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h | 77
-rw-r--r--  src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp (renamed from src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp) | 70
-rw-r--r--  src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h | 77
-rw-r--r--  src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp (renamed from src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp) | 64
-rw-r--r--  src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h | 74
-rw-r--r--  src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp (renamed from src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp) | 84
-rw-r--r--  src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h | 70
-rw-r--r--  src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp (renamed from src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp) | 120
-rw-r--r--  src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h | 75
-rw-r--r--  src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp (renamed from src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp) | 62
-rw-r--r--  src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h | 71
-rw-r--r--  src/runtime/CL/functions/CLConcatenateLayer.cpp | 243
-rw-r--r--  src/runtime/gpu/cl/IClOperator.h | 37
-rw-r--r--  src/runtime/gpu/cl/operators/ClConcatenate.cpp | 254
-rw-r--r--  src/runtime/gpu/cl/operators/ClConcatenate.h | 86
29 files changed, 1195 insertions, 999 deletions
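Nothing changes for users of the public runtime function: CLConcatenateLayer keeps its ICLTensor-based interface and does the tensor packing internally through its Impl. A short usage sketch follows; tensor shapes and setup are illustrative only, not taken from the patch.

```cpp
// Illustrative use of the unchanged public interface; internally the function
// now forwards to the state-less operator introduced by this patch.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));

    CLConcatenateLayer concat;
    std::vector<const ICLTensor *> inputs{ &in0, &in1 };
    concat.configure(inputs, &out, 0U); // concatenate along the width (axis 0)

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    out.allocator()->allocate(); // output info is expected to be auto-initialised during configure()

    concat.run();
    return 0;
}
```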
diff --git a/Android.bp b/Android.bp
index 41ed188e6d..4427bd4fee 100644
--- a/Android.bp
+++ b/Android.bp
@@ -82,7 +82,6 @@ cc_library_static {
"src/core/CL/kernels/CLAccumulateKernel.cpp",
"src/core/CL/kernels/CLActivationLayerKernel.cpp",
"src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp",
- "src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp",
"src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp",
"src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp",
"src/core/CL/kernels/CLBitwiseKernel.cpp",
@@ -101,7 +100,6 @@ cc_library_static {
"src/core/CL/kernels/CLCropKernel.cpp",
"src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp",
"src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp",
- "src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp",
"src/core/CL/kernels/CLDepthConvertLayerKernel.cpp",
"src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp",
"src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp",
@@ -145,7 +143,6 @@ cc_library_static {
"src/core/CL/kernels/CLHOGDescriptorKernel.cpp",
"src/core/CL/kernels/CLHOGDetectorKernel.cpp",
"src/core/CL/kernels/CLHarrisCornersKernel.cpp",
- "src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp",
"src/core/CL/kernels/CLHistogramKernel.cpp",
"src/core/CL/kernels/CLIm2ColKernel.cpp",
"src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp",
@@ -197,9 +194,6 @@ cc_library_static {
"src/core/CL/kernels/CLWarpAffineKernel.cpp",
"src/core/CL/kernels/CLWarpPerspectiveKernel.cpp",
"src/core/CL/kernels/CLWeightsReshapeKernel.cpp",
- "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp",
- "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp",
- "src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp",
"src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp",
"src/core/CL/kernels/CLWinogradInputTransformKernel.cpp",
"src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp",
@@ -437,6 +431,12 @@ cc_library_static {
"src/core/cpu/kernels/add/sve/qsymm16.cpp",
"src/core/cpu/kernels/floor/NEON/fp16.cpp",
"src/core/cpu/kernels/floor/NEON/fp32.cpp",
+ "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp",
+ "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp",
+ "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp",
+ "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp",
+ "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp",
+ "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp",
"src/core/helpers/SoftmaxHelpers.cpp",
"src/core/helpers/WindowHelpers.cpp",
"src/core/utils/ScaleUtils.cpp",
@@ -781,6 +781,7 @@ cc_library_static {
"src/runtime/cpu/operators/CpuFloor.cpp",
"src/runtime/cpu/operators/CpuPermute.cpp",
"src/runtime/cpu/operators/CpuReshape.cpp",
+ "src/runtime/gpu/cl/operators/ClConcatenate.cpp",
"utils/CommonGraphOptions.cpp",
"utils/GraphUtils.cpp",
"utils/Utils.cpp",
diff --git a/SConscript b/SConscript
index 8b8e504832..121cf3220a 100644
--- a/SConscript
+++ b/SConscript
@@ -212,11 +212,15 @@ if env['opencl']:
core_files += Glob('src/core/CL/gemm/native/*.cpp')
core_files += Glob('src/core/CL/gemm/reshaped/*.cpp')
core_files += Glob('src/core/CL/gemm/reshaped_only_rhs/*.cpp')
+ core_files += Glob('src/core/gpu/cl/*.cpp')
+ core_files += Glob('src/core/gpu/cl/kernels/*.cpp')
runtime_files += Glob('src/runtime/CL/*.cpp')
runtime_files += Glob('src/runtime/CL/functions/*.cpp')
runtime_files += Glob('src/runtime/CL/gemm/*.cpp')
runtime_files += Glob('src/runtime/CL/tuners/*.cpp')
+ runtime_files += Glob('src/runtime/gpu/cl/*.cpp')
+ runtime_files += Glob('src/runtime/gpu/cl/operators/*.cpp')
graph_files += Glob('src/graph/backends/CL/*.cpp')
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index 5e7003a112..bfc8a39ac9 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_CLCONCATENATELAYER_H
#define ARM_COMPUTE_CLCONCATENATELAYER_H
-#include "arm_compute/runtime/CL/ICLOperator.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/core/Types.h"
@@ -43,10 +42,10 @@ class Status;
/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
*
- * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
- * -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
- * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
- * -# @ref CLBatchConcatenateLayerKernel (if underlying concatenation axis is 3).
+ * -# @ref opencl::kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
+ * -# @ref opencl::kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
+ * -# @ref opencl::kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
+ * -# @ref opencl::kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
*/
class CLConcatenateLayer : public IFunction
{
@@ -66,7 +65,8 @@ public:
/** Initialise the kernel's inputs vector and output.
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
+ * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel,
+ * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel.
*
* @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All
* @param[out] output Output tensor. Data types supported: Same as @p input.
@@ -76,7 +76,8 @@ public:
/** Initialise the kernel's inputs vector and output.
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
+ * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel,
+ * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel.
*
* @param[in] compile_context The compile context to be used.
* @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All
@@ -87,7 +88,8 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
+ * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel,
+ * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel.
*
* @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: All.
* @param[in] output Output tensor info. Data types supported: Same as @p input.
@@ -104,54 +106,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-
-namespace experimental
-{
-/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
- *
- * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
- * -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
- * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
- * -# @ref CLBatchConcatenateLayerKernel (if underlying concatenation axis is 3).
- */
-class CLConcatenation : public ICLOperator
-{
-public:
- /** Default constructor */
- CLConcatenation();
- /** Initialise the kernel's inputs vector and output.
- *
- * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
- *
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
- */
- void configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &inputs_vector, ITensorInfo *output, size_t axis);
- /** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer
- *
- * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
- *
- * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: All
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
- *
- * @return a status
- */
- static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- std::vector<std::unique_ptr<ICLKernel>> _concat_kernels;
- unsigned int _num_inputs;
- unsigned int _axis;
-};
-} // namespace experimental
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLCONCATENATELAYER_H */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 9c0020da66..af78a70abc 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -165,7 +165,7 @@ v20.11 Public major release
- @ref NEGEMMLowpMatrixAReductionKernel
- @ref NEGEMMLowpMatrixBReductionKernel
- Removed padding from OpenCL kernels:
- - @ref CLBatchConcatenateLayerKernel
+ - CLBatchConcatenateLayerKernel
- @ref CLElementwiseOperationKernel
- @ref CLBatchNormalizationLayerKernel
- @ref CLPoolingLayerKernel
@@ -184,17 +184,17 @@ v20.11 Public major release
- @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
- @ref CLActivationLayerKernel
- @ref CLWinogradFilterTransformKernel
- - @ref CLWidthConcatenateLayerKernel
- - @ref CLWidthConcatenate4TensorsKernel
- - @ref CLWidthConcatenate2TensorsKernel
+ - CLWidthConcatenateLayerKernel
+ - CLWidthConcatenate4TensorsKernel
+ - CLWidthConcatenate2TensorsKernel
- @ref CLLogits1DMaxShiftExpSumKernel
- @ref CLLogits1DNormKernel
- - @ref CLHeightConcatenateLayerKernel
+ - CLHeightConcatenateLayerKernel
- @ref CLGEMMMatrixMultiplyKernel
- @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
- - @ref CLDepthConcatenateLayerKernel
+ - CLDepthConcatenateLayerKernel
- @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- Removed OpenCL kernels / functions:
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
@@ -721,7 +721,7 @@ v19.08 Public major release
- @ref CLNegLayer
- @ref CLPReluLayer
- @ref CLSinLayer
- - @ref CLBatchConcatenateLayerKernel
+ - CLBatchConcatenateLayerKernel
- @ref CLDepthToSpaceLayerKernel / @ref CLDepthToSpaceLayer
- @ref CLGEMMLowpMatrixMultiplyNativeKernel
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
@@ -773,7 +773,7 @@ v19.05 Public major release
- @ref CLFFTScaleKernel
- @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
- @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
- - @ref CLHeightConcatenateLayerKernel
+ - CLHeightConcatenateLayerKernel
- @ref CLDirectDeconvolutionLayer
- @ref CLFFT1D
- @ref CLFFT2D
@@ -1011,7 +1011,7 @@ v18.05 Public major release
- @ref CLCopy / @ref CLCopyKernel
- @ref CLLSTMLayer
- @ref CLRNNLayer
- - CLWidthConcatenateLayer / @ref CLWidthConcatenateLayerKernel
+ - CLWidthConcatenateLayer / CLWidthConcatenateLayerKernel
- @ref CLWinogradFilterTransformKernel / @ref CLWinogradInputTransformKernel / @ref CLWinogradConvolutionLayer
- @ref CLWinogradInputTransformKernel / @ref CLWinogradInputTransform
- New Neon kernels / functions:
@@ -1220,7 +1220,7 @@ v17.06 Public major release
- User can specify his own scheduler by implementing the @ref IScheduler interface.
- New OpenCL kernels / functions:
- @ref CLBatchNormalizationLayerKernel / @ref CLBatchNormalizationLayer
- - @ref CLDepthConcatenateLayerKernel / CLDepthConcatenateLayer
+ - CLDepthConcatenateLayerKernel / CLDepthConcatenateLayer
- @ref CLHOGOrientationBinningKernel @ref CLHOGBlockNormalizationKernel, @ref CLHOGDetectorKernel / @ref CLHOGDescriptor @ref CLHOGDetector @ref CLHOGGradient @ref CLHOGMultiDetection
- CLLocallyConnectedMatrixMultiplyKernel / CLLocallyConnectedLayer
- @ref CLWeightsReshapeKernel / @ref CLConvolutionLayerReshapeWeights
diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h
index f23871d4db..11f1d2d7cf 100644
--- a/src/core/CL/CLKernels.h
+++ b/src/core/CL/CLKernels.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,6 @@
#include "src/core/CL/kernels/CLAccumulateKernel.h"
#include "src/core/CL/kernels/CLActivationLayerKernel.h"
#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
-#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
#include "src/core/CL/kernels/CLBitwiseKernel.h"
@@ -48,7 +47,6 @@
#include "src/core/CL/kernels/CLCropKernel.h"
#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
-#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
@@ -92,7 +90,6 @@
#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
#include "src/core/CL/kernels/CLHistogramKernel.h"
#include "src/core/CL/kernels/CLIm2ColKernel.h"
#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
@@ -144,9 +141,6 @@
#include "src/core/CL/kernels/CLWarpAffineKernel.h"
#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
-#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
-#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"
#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"
#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h"
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h
deleted file mode 100644
index 54a89eb243..0000000000
--- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the batch concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CLBatchConcatenateLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLBatchConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBatchConcatenateLayerKernel(const CLBatchConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBatchConcatenateLayerKernel &operator=(const CLBatchConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLBatchConcatenateLayerKernel(CLBatchConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLBatchConcatenateLayerKernel &operator=(CLBatchConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~CLBatchConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] batch_offset The offset on axis # 3.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- * @note: The output tensor's low two dimensions can't be smaller than the input one's.
- * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLBatchConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis # 3.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- unsigned int _batch_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H */
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h
deleted file mode 100644
index 6c73bd4bf4..0000000000
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** Interface for the depth concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CLDepthConcatenateLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthConcatenateLayerKernel(const CLDepthConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthConcatenateLayerKernel &operator=(const CLDepthConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDepthConcatenateLayerKernel(CLDepthConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDepthConcatenateLayerKernel &operator=(CLDepthConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~CLDepthConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- * @note: The output tensor's low two dimensions can't be smaller than the input one's.
- * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- unsigned int _depth_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H */
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h
deleted file mode 100644
index f4cb627052..0000000000
--- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** Interface for the height concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CLHeightConcatenateLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLHeightConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHeightConcatenateLayerKernel(const CLHeightConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHeightConcatenateLayerKernel &operator=(const CLHeightConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLHeightConcatenateLayerKernel(CLHeightConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLHeightConcatenateLayerKernel &operator=(CLHeightConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~CLHeightConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLHeightConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- unsigned int _height_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
deleted file mode 100644
index 2af89e12eb..0000000000
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** Interface for the width concatenate kernel of 2 tensors.
- * The input1 and input2 tensors will be concatenated into the output tensor.
- */
-class CLWidthConcatenate2TensorsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWidthConcatenate2TensorsKernel() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenate2TensorsKernel &operator=(const CLWidthConcatenate2TensorsKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWidthConcatenate2TensorsKernel(CLWidthConcatenate2TensorsKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWidthConcatenate2TensorsKernel &operator=(CLWidthConcatenate2TensorsKernel &&) = default;
- /** Default destructor */
- ~CLWidthConcatenate2TensorsKernel() = default;
- /** Initialise the kernel's input1s and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 First input tensor. Data types supported: All.
- * @param[in] input2 Second input tensor. Data types supported: same as @p input1
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel
- *
- * @param[in] input1 First tensor info. Data types supported: All.
- * @param[in] input2 Second tensor info. Data types supported: same as @p input1
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H */
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
deleted file mode 100644
index 0caf87114d..0000000000
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** Interface for the width concatenate kernel of 4 tensors.
- * All input tensors will be concatenated into the output tensor.
- */
-class CLWidthConcatenate4TensorsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWidthConcatenate4TensorsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenate4TensorsKernel(const CLWidthConcatenate4TensorsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenate4TensorsKernel &operator=(const CLWidthConcatenate4TensorsKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWidthConcatenate4TensorsKernel(CLWidthConcatenate4TensorsKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWidthConcatenate4TensorsKernel &operator=(CLWidthConcatenate4TensorsKernel &&) = default;
- /** Default destructor */
- ~CLWidthConcatenate4TensorsKernel() = default;
- /** Initialise the kernel's input1s and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 First input tensor. Data types supported: All.
- * @param[in] input2 Second input tensor. Data types supported: same as @p input1
- * @param[in] input3 Third input tensor. Data types supported: same as @p input1
- * @param[in] input4 Fourth input tensor. Data types supported: same as @p input1
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel
- *
- * @param[in] input1 First tensor info. Data types supported: All.
- * @param[in] input2 Second tensor info. Data types supported: same as @p input1
- * @param[in] input3 Third tensor info. Data types supported: same as @p input1
- * @param[in] input4 Fourth tensor info. Data types supported: same as @p input1
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H */
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h
deleted file mode 100644
index 09c3f4455d..0000000000
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** Interface for the width concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CLWidthConcatenateLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWidthConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenateLayerKernel(const CLWidthConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenateLayerKernel &operator=(const CLWidthConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWidthConcatenateLayerKernel(CLWidthConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWidthConcatenateLayerKernel &operator=(CLWidthConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~CLWidthConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] width_offset The offset on the X axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] width_offset The offset on the X axis.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H */
diff --git a/src/core/gpu/cl/ClCompileContext.h b/src/core/gpu/cl/ClCompileContext.h
new file mode 100644
index 0000000000..e69cc0200f
--- /dev/null
+++ b/src/core/gpu/cl/ClCompileContext.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COMPILE_CONTEXT_H
+#define ARM_COMPUTE_CL_COMPILE_CONTEXT_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using ClCompileContext = arm_compute::CLCompileContext;
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_COMPILE_CONTEXT_H */
diff --git a/src/core/gpu/cl/IClKernel.h b/src/core/gpu/cl/IClKernel.h
new file mode 100644
index 0000000000..52ea3c9183
--- /dev/null
+++ b/src/core/gpu/cl/IClKernel.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICL_KERNEL_H
+#define ARM_COMPUTE_ICL_KERNEL_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using IClKernel = arm_compute::ICLKernel;
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_ICL_KERNEL_H */
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
index ccd6a5a0fc..c16ff1f028 100644
--- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
+#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -36,50 +36,54 @@
namespace arm_compute
{
+namespace opencl
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimZ) != output->dimension(Window::DimZ));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) + batch_offset > output->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst);
return Status{};
}
} // namespace
-CLBatchConcatenateLayerKernel::CLBatchConcatenateLayerKernel()
+ClBatchConcatenateKernel::ClBatchConcatenateKernel()
: _batch_offset(0)
{
}
-void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output)
+void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, batch_offset, output));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({ src, dst });
_batch_offset = batch_offset;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
// Add build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
{
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
@@ -91,12 +95,12 @@ void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_co
_kernel = create_kernel(compile_context, "concatenate", build_opts.options());
// Configure kernel window
- auto win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- win.set(3, Window::Dimension(0, input->tensor_shape()[3], 1));
+ auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+ win.set(3, Window::Dimension(0, src->tensor_shape()[3], 1));
ICLKernel::configure_internal(win);
- // Set output valid region
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ // Set dst valid region
+ dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape()));
// Set config_id for enabling LWS tuning
_config_id = "concatenate_";
@@ -104,26 +108,26 @@ void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_co
_config_id += "_";
_config_id += support::cpp11::to_string(batch_offset);
_config_id += "_";
- _config_id += support::cpp11::to_string(input->dimension(0));
+ _config_id += support::cpp11::to_string(src->dimension(0));
_config_id += "_";
- _config_id += support::cpp11::to_string(input->dimension(1));
+ _config_id += support::cpp11::to_string(src->dimension(1));
_config_id += "_";
- _config_id += support::cpp11::to_string(input->dimension(2));
+ _config_id += support::cpp11::to_string(src->dimension(2));
_config_id += "_";
- _config_id += support::cpp11::to_string(input->dimension(3));
+ _config_id += support::cpp11::to_string(src->dimension(3));
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input,
- unsigned int batch_offset,
- const arm_compute::ITensorInfo *output)
+Status ClBatchConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
+ unsigned int batch_offset,
+ const arm_compute::ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, batch_offset, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst));
return Status{};
}
-void CLBatchConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -135,7 +139,7 @@ void CLBatchConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &w
const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3];
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
_kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
do
@@ -147,4 +151,6 @@ void CLBatchConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &w
}
while(window.slide_window_slice_3D(slice));
}
+} // namespace opencl
+} // namespace kernels
} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h
new file mode 100644
index 0000000000..378a08aa4f
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the batch concatenate kernel.
+ * The src tensor will be concatenated into the destination tensor.
+ */
+class ClBatchConcatenateKernel : public IClKernel
+{
+public:
+ /** Default constructor */
+ ClBatchConcatenateKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClBatchConcatenateKernel);
+ /** Initialise the kernel's source and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data types supported: All.
+ * @param[in] batch_offset The offset on axis # 3.
+ * @param[in,out] dst Destination tensor. Data types supported: Same as @p src.
+ *
+ * @note: The dst tensor's low two dimensions can't be smaller than the src one's.
+ * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref ClBatchConcatenateKernel
+ *
+ * @param[in] src Input tensor info. Data types supported: All.
+ * @param[in] batch_offset The offset on axis # 3.
+ * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ unsigned int _batch_offset;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H */
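The header above only declares a state-less interface: configure() and validate() work on ITensorInfo, and the actual tensors are bound at run time through an ITensorPack. A minimal sketch of that pattern follows; the configure/validate/run_op signatures are the ones declared above, but the ACL_SRC/ACL_DST slot ids and the CLKernelLibrary/CLScheduler plumbing are assumptions for illustration, and since the kernel lives under src/ such code would only build inside the library tree (applications normally go through CLConcatenateLayer instead).

    // Hedged sketch: driving a state-less ClBatchConcatenateKernel directly.
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"

    using namespace arm_compute;

    void batch_concat_sketch(CLTensor &src, CLTensor &dst, unsigned int batch_offset)
    {
        opencl::kernels::ClBatchConcatenateKernel kernel;
        // Configuration only touches metadata; no tensor backing is captured by the kernel.
        kernel.configure(CLKernelLibrary::get().get_compile_context(), src.info(), batch_offset, dst.info());

        // Tensors are supplied per run through an ITensorPack (slot ids assumed).
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        kernel.run_op(pack, kernel.window(), CLScheduler::get().queue());
    }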
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
index eb5bfc2d86..e8893d76d2 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
+#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -36,49 +36,53 @@
namespace arm_compute
{
+namespace opencl
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst);
return Status{};
}
} // namespace
-CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel()
+ClDepthConcatenateKernel::ClDepthConcatenateKernel()
: _depth_offset(0)
{
}
-void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output)
+void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, depth_offset, output));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({ src, dst });
_depth_offset = depth_offset;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
// Add build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
{
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
@@ -90,25 +94,25 @@ void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_co
_kernel = create_kernel(compile_context, "concatenate", build_opts.options());
// Configure kernel window
- auto win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1));
+ auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+ win.set(Window::DimZ, Window::Dimension(0, src->tensor_shape().z(), 1));
ICLKernel::configure_internal(win);
- // Set output valid region
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ // Set dst valid region
+ dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape()));
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input,
- unsigned int depth_offset,
- const arm_compute::ITensorInfo *output)
+Status ClDepthConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
+ unsigned int depth_offset,
+ const arm_compute::ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, depth_offset, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst));
return Status{};
}
-void CLDepthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -120,7 +124,7 @@ void CLDepthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &w
const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2];
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
_kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
do
@@ -132,4 +136,6 @@ void CLDepthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &w
}
while(window.slide_window_slice_3D(slice));
}
+} // namespace kernels
+} // namespace opencl
} // namespace arm_compute
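The configure() above derives the per-iteration vector width from the element size (16 / element_size) clamped by adjust_vec_size to the innermost dimension, and passes the remainder as VEC_SIZE_LEFTOVER so the kernel can handle a partial tail vector. A self-contained sketch of that arithmetic, modelling adjust_vec_size as a plain clamp (an assumption about the helper's exact behaviour):

    // Vectorization arithmetic behind -DVEC_SIZE / -DVEC_SIZE_LEFTOVER (hedged sketch).
    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const unsigned int element_size = 4;                           // e.g. F32
        const unsigned int width        = 9;                           // src->dimension(0)
        const unsigned int requested    = 16 / element_size;           // 4 elements per access
        const unsigned int vec_size     = std::min(requested, width);  // assumed clamp behaviour
        const unsigned int leftover     = width % vec_size;            // 9 % 4 = 1

        // The kernel would be compiled with -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=1,
        // so the last element of each row is processed as a 1-element tail.
        std::printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
        return 0;
    }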
diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h
new file mode 100644
index 0000000000..144d7d48f2
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the depth concatenate kernel.
+ * The src tensor will be concatenated into the dst tensor.
+ */
+class ClDepthConcatenateKernel : public IClKernel
+{
+public:
+ /** Default constructor */
+ ClDepthConcatenateKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDepthConcatenateKernel);
+ /** Initialise the kernel's source and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] depth_offset The offset on the Z axis.
+ * @param[in,out] dst Destination tensor. Data types supported: Same as @p src.
+ *
+ * @note: The lowest two dimensions of the dst tensor can't be smaller than those of the src tensor.
+ * @note: The differences between the lowest two dimensions of src and dst must be divisible by 2.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref ClDepthConcatenateKernel
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] depth_offset The offset on the Z axis.
+ * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ unsigned int _depth_offset;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H */
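When the source and destination carry different quantization info, configure() forwards OFFSET_IN1/SCALE_IN1 and OFFSET_OUT/SCALE_OUT so the OpenCL kernel can re-quantize values as it copies them. The mapping those constants imply is the usual dequantize-then-requantize step; the rounding mode and QASYMM8 clamp below are assumptions, since the OpenCL-side arithmetic is not part of this hunk.

    // Hedged sketch of the re-quantization implied by the OFFSET_*/SCALE_* build options.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t requantize(uint8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
    {
        const float real = scale_in * (static_cast<int>(q_in) - offset_in);              // dequantize with src params
        const int   q    = static_cast<int>(std::lround(real / scale_out)) + offset_out; // requantize with dst params
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));                      // clamp to QASYMM8 range
    }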
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
index 8aa7366d50..83e976e10f 100644
--- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -37,60 +37,64 @@
namespace arm_compute
{
+namespace opencl
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) + height_offset > output->dimension(Window::DimY));
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != output->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0));
for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4);
return Status{};
}
} // namespace
-CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel()
+ClHeightConcatenateKernel::ClHeightConcatenateKernel()
: _height_offset(0)
{
}
-Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst));
return Status{};
}
-void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output)
+void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, height_offset, output));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({ src, dst });
_height_offset = height_offset;
// Add build options
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->dimension(0));
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2)));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2)));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
+ if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
{
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
@@ -102,17 +106,17 @@ void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_c
_kernel = create_kernel(compile_context, "concatenate_height", build_opts.options());
// Configure kernel window
- // The window needs to be based on input as we copy all the heights of input
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ // The window needs to be based on src as we copy all the heights of src
+ Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
- // Set output valid region
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ // Set dst valid region
+ dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape()));
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLHeightConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -125,4 +129,6 @@ void CLHeightConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &
add_4D_tensor_argument(idx, dst, window);
enqueue(queue, *this, window, lws_hint());
}
+} // namespace kernels
+} // namespace opencl
} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h
new file mode 100644
index 0000000000..88cd4c4d17
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H
+#define ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the height concatenate kernel.
+ * The source tensor will be concatenated into the destination tensor.
+ */
+class ClHeightConcatenateKernel : public IClKernel
+{
+public:
+ /** Default constructor */
+ ClHeightConcatenateKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClHeightConcatenateKernel);
+ /** Initialise the kernel's source and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data types supported: All.
+ * @param[in] height_offset The starting offset on the Y axis for the dst tensor.
+ * @param[out] dst Destination tensor. Data types supported: same as @p src.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref ClHeightConcatenateKernel
+ *
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[in] height_offset The starting offset on the Y axis for the dst tensor.
+ * @param[in] dst Destination tensor info. Data types supported: same as @p src.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ unsigned int _height_offset;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H */
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
index d6697ba46b..6a2ab3b50f 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -37,62 +37,66 @@
namespace arm_compute
{
+namespace opencl
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) > output->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0));
for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
}
- ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4);
return Status{};
}
} // namespace
-Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst));
return Status{};
}
-void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst));
- auto padding_info = get_padding_info({ input1, input2, output });
+ auto padding_info = get_padding_info({ src1, src2, dst });
- const unsigned int min_dimension = std::min(input1->dimension(0), input2->dimension(0));
+ const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0));
const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
- const unsigned int vec_size_leftover = output->dimension(0) % num_elems_processed_per_iteration;
+ const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration;
// Add build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2)));
- build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0)));
- build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->dimension(0)));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size()));
- build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0)));
+ build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0)));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
+ build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
// If input have different quantization info set quantization parameters needed for the re-quantization process
- const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2);
- if(is_data_type_quantized_asymmetric(input1->data_type()) && have_different_qinfo)
+ const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2);
+ if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
{
- const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+ const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
@@ -106,27 +110,27 @@ void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile
_kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
- // Set output valid region
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ // Set dst valid region
+ dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape()));
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
// Set config_id for enabling LWS tuning
_config_id = "concatenate_width_x2_";
- _config_id += lower_string(string_from_data_type(input1->data_type()));
+ _config_id += lower_string(string_from_data_type(src1->data_type()));
_config_id += "_";
- _config_id += support::cpp11::to_string(input1->dimension(0));
+ _config_id += support::cpp11::to_string(src1->dimension(0));
_config_id += "_";
- _config_id += support::cpp11::to_string(input1->dimension(1));
+ _config_id += support::cpp11::to_string(src1->dimension(1));
_config_id += "_";
- _config_id += support::cpp11::to_string(input2->dimension(0));
+ _config_id += support::cpp11::to_string(src2->dimension(0));
_config_id += "_";
- _config_id += support::cpp11::to_string(input2->dimension(1));
+ _config_id += support::cpp11::to_string(src2->dimension(1));
}
-void CLWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -147,4 +151,6 @@ void CLWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window
}
while(window.slide_window_slice_4D(slice));
}
+} // namespace kernels
+} // namespace opencl
} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
new file mode 100644
index 0000000000..92715008cf
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_2TENSORS_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_2TENSORS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel of 2 tensors.
+ * The src1 and src2 tensors will be concatenated into the dst tensor.
+ */
+class ClWidthConcatenate2TensorsKernel : public IClKernel
+{
+public:
+ /** Default constructor */
+ ClWidthConcatenate2TensorsKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel);
+ /** Initialise the kernel's sources and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor. Data types supported: All.
+ * @param[in] src2 Second source tensor. Data types supported: same as @p src1
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src1.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenate2TensorsKernel
+ *
+ * @param[in] src1 First tensor info. Data types supported: All.
+ * @param[in] src2 Second tensor info. Data types supported: same as @p src1
+ * @param[in] dst Destination tensor info. Data types supported: Same as @p src1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_2TENSORS_KERNEL_H */
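Like the single-source kernels, the two-tensor variant is validated and configured purely on ITensorInfo. A short sketch of the pre-flight check plus configure call, using only the signatures declared above (the wrapper function and its error handling are illustrative):

    // Hedged sketch: validate-then-configure for the two-tensor width concatenation.
    #include "arm_compute/core/Error.h"
    #include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"

    using namespace arm_compute;

    Status try_configure_width_x2(opencl::kernels::ClWidthConcatenate2TensorsKernel &kernel,
                                  const CLCompileContext &ctx,
                                  ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
    {
        // Static pre-flight check: among other things, dst width must cover src1 + src2 widths.
        const Status status = opencl::kernels::ClWidthConcatenate2TensorsKernel::validate(src1, src2, dst);
        if(!bool(status))
        {
            return status;
        }
        kernel.configure(ctx, src1, src2, dst); // state-less: only the ITensorInfo metadata is used
        return status;
    }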
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
index 7ecdd30224..4b49652a73 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -37,76 +37,80 @@
namespace arm_compute
{
+namespace opencl
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0));
for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(input3->dimension(i) != output->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(input4->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src3->dimension(i) != dst->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src4->dimension(i) != dst->dimension(i));
}
- ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4);
return Status{};
}
} // namespace
-CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel()
+ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel()
{
}
-Status CLWidthConcatenate4TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, input3, input4, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst));
return Status{};
}
-void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context,
- ITensorInfo *input1, ITensorInfo *input2,
- ITensorInfo *input3, ITensorInfo *input4,
- ITensorInfo *output)
+void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1, ITensorInfo *src2,
+ ITensorInfo *src3, ITensorInfo *src4,
+ ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, input3, input4, output));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst));
- auto padding_info = get_padding_info({ input1, input2, input3, input4, output });
- const unsigned int min_dimension = std::min(std::min(input1->dimension(0), input2->dimension(0)), std::min(input3->dimension(0), input4->dimension(0)));
+ auto padding_info = get_padding_info({ src1, src2, src3, src4, dst });
+ const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0)));
const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
- const unsigned int vec_size_leftover = output->dimension(0) % num_elems_processed_per_iteration;
+ const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration;
// Add build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2)));
- build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0)));
- build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->dimension(0)));
- build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->dimension(0)));
- build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(input4->dimension(0)));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size()));
- build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
- build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) + input2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
- build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) + input2->dimension(0) + input3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
-
- // If input have different quantization info set quantization parameters needed for the re-quantization process
- const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2, input3, input4);
- if(is_data_type_quantized_asymmetric(input1->data_type()) && have_different_qinfo)
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0)));
+ build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0)));
+ build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(src3->dimension(0)));
+ build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(src4->dimension(0)));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
+ build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+
+ // If sources have different quantization info, set the quantization parameters needed for the re-quantization process
+ const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4);
+ if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
{
- const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform();
- const UniformQuantizationInfo iq3_info = input3->quantization_info().uniform();
- const UniformQuantizationInfo iq4_info = input4->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+ const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+ const UniformQuantizationInfo iq3_info = src3->quantization_info().uniform();
+ const UniformQuantizationInfo iq4_info = src4->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
@@ -124,35 +128,35 @@ void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile
_kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
- // Set output valid region
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ // Set dst valid region
+ dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape()));
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
// Set config_id for enabling LWS tuning
_config_id = "concatenate_width_x4_";
- _config_id += lower_string(string_from_data_type(input1->data_type()));
+ _config_id += lower_string(string_from_data_type(src1->data_type()));
_config_id += "_";
- _config_id += support::cpp11::to_string(input1->dimension(0));
+ _config_id += support::cpp11::to_string(src1->dimension(0));
_config_id += "_";
- _config_id += support::cpp11::to_string(input1->dimension(1));
+ _config_id += support::cpp11::to_string(src1->dimension(1));
_config_id += "_";
- _config_id += support::cpp11::to_string(input2->dimension(0));
+ _config_id += support::cpp11::to_string(src2->dimension(0));
_config_id += "_";
- _config_id += support::cpp11::to_string(input2->dimension(1));
+ _config_id += support::cpp11::to_string(src2->dimension(1));
_config_id += "_";
- _config_id += support::cpp11::to_string(input3->dimension(0));
+ _config_id += support::cpp11::to_string(src3->dimension(0));
_config_id += "_";
- _config_id += support::cpp11::to_string(input3->dimension(1));
+ _config_id += support::cpp11::to_string(src3->dimension(1));
_config_id += "_";
- _config_id += support::cpp11::to_string(input4->dimension(0));
+ _config_id += support::cpp11::to_string(src4->dimension(0));
_config_id += "_";
- _config_id += support::cpp11::to_string(input4->dimension(1));
+ _config_id += support::cpp11::to_string(src4->dimension(1));
}
-void CLWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -177,4 +181,6 @@ void CLWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window
}
while(window.slide_window_slice_4D(slice));
}
+} // namespace kernels
+} // namespace opencl
} // namespace arm_compute
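The INPUTn_ROTATE_N defines computed above encode, per source, how far its data ends up shifted within a destination vector once the leftover lanes are accounted for; that reading of the constants is an interpretation, but the formulas in the sketch are the ones from configure(). A worked example with made-up widths:

    // Worked example of the -DINPUTn_ROTATE_N build options (widths are invented).
    #include <cstdio>

    int main()
    {
        const unsigned int vec      = 8;                      // num_elems_processed_per_iteration
        const unsigned int w1 = 10, w2 = 6, w3 = 12, w4 = 9;  // source widths
        const unsigned int dst_w    = w1 + w2 + w3 + w4;      // 37
        const unsigned int leftover = dst_w % vec;            // VEC_SIZE_LEFTOVER = 5

        const unsigned int rot1 = (w1 - leftover) % vec;           // (10 - 5) % 8 = 5
        const unsigned int rot2 = (w1 + w2 - leftover) % vec;      // (16 - 5) % 8 = 3
        const unsigned int rot3 = (w1 + w2 + w3 - leftover) % vec; // (28 - 5) % 8 = 7

        std::printf("LEFTOVER=%u ROTATE_N={%u,%u,%u}\n", leftover, rot1, rot2, rot3);
        return 0;
    }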
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
new file mode 100644
index 0000000000..06d6c0399a
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel of 4 tensors.
+ * All source tensors will be concatenated into the destination tensor.
+ */
+class ClWidthConcatenate4TensorsKernel : public IClKernel
+{
+public:
+ /** Default constructor */
+ ClWidthConcatenate4TensorsKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate4TensorsKernel);
+ /** Initialise the kernel's sources and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor. Data types supported: All.
+ * @param[in] src2 Second source tensor. Data types supported: same as @p src1
+ * @param[in] src3 Third source tensor. Data types supported: same as @p src1
+ * @param[in] src4 Fourth source tensor. Data types supported: same as @p src1
+ * @param[out] dst Destination tensor. Data types supported: same as @p src1.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenate4TensorsKernel
+ *
+ * @param[in] src1 First tensor info. Data types supported: All.
+ * @param[in] src2 Second tensor info. Data types supported: same as @p src1
+ * @param[in] src3 Third tensor info. Data types supported: same as @p src1
+ * @param[in] src4 Fourth tensor info. Data types supported: same as @p src1
+ * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H */
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
index 30d0a481bd..8cbbc27444 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
+#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -36,58 +36,62 @@
namespace arm_compute
{
+namespace opencl
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) + width_offset > output->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4);
return Status{};
}
} // namespace
-CLWidthConcatenateLayerKernel::CLWidthConcatenateLayerKernel()
+ClWidthConcatenateKernel::ClWidthConcatenateKernel()
{
}
-Status CLWidthConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
+Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, width_offset, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst));
return Status{};
}
-void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output)
+void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, width_offset, output));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({ src, dst });
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, input->dimension(0));
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0));
// Add build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2)));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2)));
- if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
+ if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
{
- const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+ const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset));
build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset));
@@ -98,16 +102,16 @@ void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_co
// Create kernel
_kernel = create_kernel(compile_context, "concatenate_width", build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
- // Set output valid region
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ // Set dst valid region
+ dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape()));
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLWidthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -120,4 +124,6 @@ void CLWidthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &w
add_4D_tensor_argument(idx, dst, window);
enqueue(queue, *this, window, lws_hint());
}
+} // namespace kernels
+} // namespace opencl
} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h
new file mode 100644
index 0000000000..3bffe52700
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel.
+ * The source tensor will be concatenated into the destination tensor.
+ */
+class ClWidthConcatenateKernel : public IClKernel
+{
+public:
+ /** Default constructor */
+ ClWidthConcatenateKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenateKernel);
+ /** Initialise the kernel's source and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data types supported: All.
+ * @param[in] width_offset The offset on the X axis.
+ * @param[in,out] dst Destination tensor. Data types supported: same as @p src.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenateKernel
+ *
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[in] width_offset The offset on the X axis.
+ * @param[in] dst Destination tensor info. Data types supported: same as @p src.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H */
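For illustration only, and not part of the patch: a minimal sketch of how the state-less kernel above is driven. The kernel is configured against ITensorInfo objects and keeps no tensor pointers; the actual tensors are bound at execution time through an ITensorPack, mirroring the ACL_SRC/ACL_DST slots read by run_op(). The tensor arguments and width offset are hypothetical, the CL scheduler is assumed to be initialised, obtaining the compile context from CLKernelLibrary is an assumption, and the kernel header is internal to the library.

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"

    using namespace arm_compute;

    void run_width_concat_sketch(ICLTensor *src, ICLTensor *dst, unsigned int width_offset)
    {
        // Configure with metadata only; no tensor pointers are stored in the kernel.
        opencl::kernels::ClWidthConcatenateKernel kernel;
        kernel.configure(CLKernelLibrary::get().get_compile_context(), src->info(), width_offset, dst->info());

        // Bind the tensors at run time and enqueue on the CL scheduler.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, src);
        pack.add_tensor(TensorType::ACL_DST, dst);
        CLScheduler::get().enqueue_op(kernel, pack, true);
    }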
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 0c473a79c8..ea96e45bf8 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,242 +23,19 @@
*/
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
-#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
-#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
-#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
-#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
-
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/runtime/gpu/cl/operators/ClConcatenate.h"
namespace arm_compute
{
-namespace experimental
-{
-CLConcatenation::CLConcatenation()
- : _concat_kernels(),
- _num_inputs(0),
- _axis(Window::DimX)
-{
-}
-
-void CLConcatenation::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &inputs_vector, ITensorInfo *output, size_t axis)
-{
- ARM_COMPUTE_ERROR_ON(output == nullptr);
- _axis = axis;
- _num_inputs = inputs_vector.size();
-
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
- std::vector<const ITensorInfo *> const_inputs_vector(inputs_vector.size());
- std::transform(inputs_vector.begin(), inputs_vector.end(), const_inputs_vector.begin(), [](ITensorInfo * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t;
- });
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, inputs_vector[0]->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(const_inputs_vector, output, axis));
-
- unsigned int offset = 0;
- switch(_axis)
- {
- case Window::DimX:
- {
- switch(_num_inputs)
- {
- case 2:
- {
- // Configure WidthConcatenate2Tensors kernel
- auto kernel = std::make_unique<CLWidthConcatenate2TensorsKernel>();
- kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), output);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- case 4:
- {
- // Configure WidthConcatenate4Tensors kernel
- auto kernel = std::make_unique<CLWidthConcatenate4TensorsKernel>();
- kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- default:
- {
- // Configure generic case WidthConcatenate kernels
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<CLWidthConcatenateLayerKernel>();
- kernel->configure(compile_context, inputs_vector.at(i), offset, output);
- offset += inputs_vector.at(i)->dimension(_axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- }
- break;
- }
- case Window::DimY:
- {
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<CLHeightConcatenateLayerKernel>();
- kernel->configure(compile_context, inputs_vector.at(i), offset, output);
- offset += inputs_vector.at(i)->dimension(_axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- case Window::DimZ:
- {
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<CLDepthConcatenateLayerKernel>();
- kernel->configure(compile_context, inputs_vector.at(i), offset, output);
- offset += inputs_vector.at(i)->dimension(_axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- case 3:
- {
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<CLBatchConcatenateLayerKernel>();
- kernel->configure(compile_context, inputs_vector.at(i), offset, output);
- offset += inputs_vector.at(i)->dimension(_axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
-}
-
-Status CLConcatenation::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
- const unsigned int num_inputs = inputs_vector.size();
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
-
- unsigned int offset = 0;
- switch(axis)
- {
- case Window::DimX:
- {
- switch(num_inputs)
- {
- case 2:
- // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], output));
- break;
- case 4:
- // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], output));
- break;
- default:
- // Validate generic case of WidthConcatenate kernel
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, offset, output));
- offset += input->dimension(axis);
- }
- break;
- }
- break;
- }
- case Window::DimY:
- {
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLHeightConcatenateLayerKernel::validate(input, offset, output));
- offset += input->dimension(axis);
- }
- break;
- }
- case Window::DimZ:
- {
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, offset, output));
- offset += input->dimension(axis);
- }
- break;
- }
- case 3:
- {
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLBatchConcatenateLayerKernel::validate(input, offset, output));
- offset += input->dimension(axis);
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
-
- if(output->total_size() != 0)
- {
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
- }
-
- return Status{};
-}
-
-void CLConcatenation::run(ITensorPack &tensors)
-{
- if(tensors.empty())
- {
- ARM_COMPUTE_ERROR("No inputs provided");
- }
-
- if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
- {
- ARM_COMPUTE_ERROR("Configured with different number of inputs");
- }
-
- if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
- {
- ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
- CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
- }
- else
- {
- int i = 0;
- for(auto &k : _concat_kernels)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
- pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
- CLScheduler::get().enqueue_op(*k, pack, true);
- ++i;
- }
- }
-}
-} // namespace experimental
-
struct CLConcatenateLayer::Impl
{
- std::vector<const ICLTensor *> srcs{};
- ICLTensor *dst{ nullptr };
- unsigned int num_inputs{ 0 };
- unsigned int axis{ 0 };
- std::unique_ptr<experimental::CLConcatenation> op{ nullptr };
+ std::vector<const ICLTensor *> srcs{};
+ ICLTensor *dst{ nullptr };
+ unsigned int num_inputs{ 0 };
+ unsigned int axis{ 0 };
+ std::unique_ptr<opencl::ClConcatenate> op{ nullptr };
};
CLConcatenateLayer::CLConcatenateLayer()
@@ -285,7 +62,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std:
_impl->dst = output;
_impl->axis = axis;
_impl->num_inputs = inputs_vector.size();
- _impl->op = std::make_unique<experimental::CLConcatenation>();
+ _impl->op = std::make_unique<opencl::ClConcatenate>();
std::vector<ITensorInfo *> inputs_vector_info;
for(unsigned int i = 0; i < inputs_vector.size(); ++i)
@@ -298,7 +75,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std:
Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
{
- return experimental::CLConcatenation::validate(inputs_vector, output, axis);
+ return opencl::ClConcatenate::validate(inputs_vector, output, axis);
}
void CLConcatenateLayer::run()
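CLConcatenateLayer::run() lies outside the hunks shown above; a hedged sketch of how it can forward the tensors stored in Impl to the state-less operator, consistent with the ACL_SRC_VEC + i and ACL_DST slots that ClConcatenate::run() reads back, is:

    void CLConcatenateLayer::run()
    {
        // Rebuild the tensor pack on every invocation; the operator itself stays state-less.
        ITensorPack pack;
        for(unsigned int i = 0; i < _impl->num_inputs; ++i)
        {
            pack.add_const_tensor(ACL_SRC_VEC + i, _impl->srcs.at(i));
        }
        pack.add_tensor(TensorType::ACL_DST, _impl->dst);
        _impl->op->run(pack);
    }

Rebuilding the pack per call keeps all mutable state on the caller side, which is the point of making the operator state-less.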
diff --git a/src/runtime/gpu/cl/IClOperator.h b/src/runtime/gpu/cl/IClOperator.h
new file mode 100644
index 0000000000..049bf05dc1
--- /dev/null
+++ b/src/runtime/gpu/cl/IClOperator.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICL_OPERATOR_H
+#define ARM_COMPUTE_ICL_OPERATOR_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using IClOperator = experimental::ICLOperator;
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_ICL_OPERATOR_H */
diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.cpp b/src/runtime/gpu/cl/operators/ClConcatenate.cpp
new file mode 100644
index 0000000000..4385fcfaed
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClConcatenate.cpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClConcatenate.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"
+#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h"
+#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h"
+#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
+#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
+#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+ClConcatenate::ClConcatenate()
+ : _concat_kernels(),
+ _num_inputs(0),
+ _axis(Window::DimX)
+{
+}
+
+void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ _axis = axis;
+ _num_inputs = src_vector.size();
+
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
+ std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
+ std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t;
+ });
+
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
+
+ unsigned int offset = 0;
+ switch(_axis)
+ {
+ case Window::DimX:
+ {
+ switch(_num_inputs)
+ {
+ case 2:
+ {
+ // Configure WidthConcatenate2Tensors kernel
+ auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>();
+ kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case 4:
+ {
+ // Configure WidthConcatenate4Tensors kernel
+ auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
+ kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ {
+ // Configure generic case WidthConcatenate kernels
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ }
+ break;
+ }
+ case Window::DimY:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ case Window::DimZ:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ case 3:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+}
+
+Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr);
+ const unsigned int num_inputs = src_vector.size();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
+
+ unsigned int offset = 0;
+ switch(axis)
+ {
+ case Window::DimX:
+ {
+ switch(num_inputs)
+ {
+ case 2:
+ // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
+ break;
+ case 4:
+ // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
+ break;
+ default:
+ // Validate generic case of WidthConcatenate kernel
+ for(const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ break;
+ }
+ case Window::DimY:
+ {
+ for(const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ case Window::DimZ:
+ {
+ for(const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ case 3:
+ {
+ for(const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+
+ if(dst->total_size() != 0)
+ {
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
+ }
+
+ return Status{};
+}
+
+void ClConcatenate::run(ITensorPack &tensors)
+{
+ if(tensors.empty())
+ {
+ ARM_COMPUTE_ERROR("No inputs provided");
+ }
+
+ if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
+ {
+ ARM_COMPUTE_ERROR("Configured with different number of inputs");
+ }
+
+ if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
+ {
+ ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
+ CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
+ }
+ else
+ {
+ int i = 0;
+ for(auto &k : _concat_kernels)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
+ pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+ CLScheduler::get().enqueue_op(*k, pack, true);
+ ++i;
+ }
+ }
+}
+} // namespace opencl
+} // namespace arm_compute
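A usage sketch of the static validation path added above: checking, before allocating anything, whether a width (axis 0) concatenation of two F32 tensors is supported. The shapes are invented for illustration; with exactly two sources on the X axis this exercises the ClWidthConcatenate2TensorsKernel::validate() branch.

    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/gpu/cl/operators/ClConcatenate.h"

    #include <vector>

    using namespace arm_compute;

    bool width_concat_is_supported()
    {
        // Two sources of 32x16 and 64x16 concatenated along the width into a 96x16 destination.
        const TensorInfo src0(TensorShape(32U, 16U), 1, DataType::F32);
        const TensorInfo src1(TensorShape(64U, 16U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(96U, 16U), 1, DataType::F32);

        const std::vector<const ITensorInfo *> srcs = { &src0, &src1 };
        return bool(opencl::ClConcatenate::validate(srcs, &dst, 0));
    }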
diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.h b/src/runtime/gpu/cl/operators/ClConcatenate.h
new file mode 100644
index 0000000000..112e2ac6b7
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClConcatenate.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONCATENATE_H
+#define ARM_COMPUTE_CLCONCATENATE_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
+ * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
+ * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
+ * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
+ */
+class ClConcatenate : public IClOperator
+{
+public:
+ /** Default constructor */
+ ClConcatenate();
+ /** Initialise the operator's sources vector and dst.
+ *
+ * @note Preconditions on the input and dst tensor dimensions differ depending on the concatenation axis.
+ * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
+ * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
+ *
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] src_vector The vector containing all the tensors to concatenate. Data types supported: All
+ * @param[out] dst Destination tensor. Data types supported: same as @p src_vector.
+ * @param[in] axis Concatenation axis. Supported underlying concatenation axes are 0, 1, 2 and 3.
+ */
+ void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis);
+ /** Static function to check if given info will lead to a valid configuration of @ref ClConcatenate
+ *
+ * @note Preconditions on the input and dst tensor dimensions differ depending on the concatenation axis.
+ * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
+ * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
+ *
+ * @param[in] src_vector The vector containing all the tensor infos to concatenate. Data types supported: All
+ * @param[in] dst Destination tensor info. Data types supported: same as @p src_vector.
+ * @param[in] axis Concatenation axis. Supported underlying concatenation axes are 0, 1, 2 and 3.
+ *
+ * @return a status
+ */
+ static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::vector<std::unique_ptr<IClKernel>> _concat_kernels;
+ unsigned int _num_inputs;
+ unsigned int _axis;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONCATENATE_H */
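Finally, a sketch of the unchanged user-facing path: the public CLConcatenateLayer keeps its tensor-based API and, after this patch, defers internally to the state-less opencl::ClConcatenate shown above. The shapes, axis and scheduler initialisation below are illustrative assumptions.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

    #include <vector>

    using namespace arm_compute;

    void depth_concat_example()
    {
        CLScheduler::get().default_init();

        // Two 8x8 sources with 4 and 6 channels concatenated into an 8x8x10 destination.
        CLTensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(8U, 8U, 6U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 10U), 1, DataType::F32));

        std::vector<const ICLTensor *> srcs = { &a, &b };
        CLConcatenateLayer concat;
        concat.configure(srcs, &dst, 2); // axis 2 maps to one ClDepthConcatenateKernel per source

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        concat.run();
    }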