author     Georgios Pinitas <georgios.pinitas@arm.com>    2019-05-17 18:14:40 +0100
committer  Georgios Pinitas <georgios.pinitas@arm.com>    2019-05-21 11:28:01 +0000
commit     09f24975437e2e141ba51a07055a9372b0d173a2 (patch)
tree       fe565e4b9abd379cb1f467e5d9e36d68fcfbacef
parent     f24411ffc842970609a1fb6ba2f9527cfb681dbd (diff)
download   ComputeLibrary-09f24975437e2e141ba51a07055a9372b0d173a2.tar.gz
COMPMID-2109: Remove CL/NE Width/Depth ConcatenateLayer functions.
Change-Id: Icbda771abffbb45d4ed0958933c60ff9ace01314
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1178
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/runtime/CL/CLFunctions.h                                    2
-rw-r--r--  arm_compute/runtime/CL/functions/CLConcatenateLayer.h                  10
-rw-r--r--  arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h             95
-rw-r--r--  arm_compute/runtime/CL/functions/CLLSTMLayer.h                           4
-rw-r--r--  arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h             88
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCFunctions.h                           1
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h   68
-rw-r--r--  arm_compute/runtime/NEON/NEFunctions.h                                   2
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConcatenateLayer.h                22
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h           93
-rw-r--r--  arm_compute/runtime/NEON/functions/NELSTMLayer.h                        12
-rw-r--r--  arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h           90
-rw-r--r--  docs/00_introduction.dox                                                22
-rw-r--r--  docs/05_functions_list.dox                                               5
-rw-r--r--  src/graph/backends/GLES/GCFunctionsFactory.cpp                          41
-rw-r--r--  src/runtime/CL/functions/CLConcatenateLayer.cpp                          6
-rw-r--r--  src/runtime/CL/functions/CLDepthConcatenateLayer.cpp                   107
-rw-r--r--  src/runtime/CL/functions/CLLSTMLayer.cpp                                 4
-rw-r--r--  src/runtime/CL/functions/CLWidthConcatenateLayer.cpp                   143
-rwxr-xr-x  src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp         75
-rw-r--r--  src/runtime/NEON/functions/NEConcatenateLayer.cpp                       31
-rw-r--r--  src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp                 108
-rw-r--r--  src/runtime/NEON/functions/NELSTMLayer.cpp                              27
-rw-r--r--  src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp                 117
-rw-r--r--  tests/benchmark/CL/DepthConcatenateLayer.cpp                             6
-rw-r--r--  tests/benchmark/NEON/DepthConcatenateLayer.cpp                           4
-rw-r--r--  tests/benchmark/fixtures/DepthConcatenateLayerFixture.h                  2
27 files changed, 96 insertions, 1089 deletions
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index e314f44370..fbaab35414 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -53,7 +53,6 @@
#include "arm_compute/runtime/CL/functions/CLCropResize.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
@@ -143,7 +142,6 @@
#include "arm_compute/runtime/CL/functions/CLUpsampleLayer.h"
#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
-#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
#include "arm_compute/runtime/CL/functions/CLYOLOLayer.h"
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index d85a4453d8..c56fc117b9 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -26,7 +26,7 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
#include <memory>
@@ -41,9 +41,9 @@ class Status;
/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
*
- * -# @ref CLWidthConcatenateLayer (if underlying concatenation axis is 0).
+ * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
* -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
- * -# @ref CLDepthConcatenateLayer (if underlying concatenation axis is 2).
+ * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
*/
class CLConcatenateLayer : public IFunction
{
@@ -53,7 +53,7 @@ public:
/** Initialise the kernel's inputs vector and output.
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayer, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayer.
+ * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
*
* @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
* @param[out] output Output tensor. Data types supported: Same as @p input.
@@ -63,7 +63,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayer, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayer.
+ * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
*
* @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/F16/F32.
* @param[in] output Output tensor info. Data types supported: Same as @p input.
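With the dedicated width/depth functions removed, callers go through CLConcatenateLayer and pass the concatenation axis explicitly. The following is a minimal usage sketch against the unified API shown in the header above; the tensor shapes and data type are illustrative assumptions, not taken from this patch.

// Sketch: concatenating two CL tensors along the width (axis 0) with the
// unified CLConcatenateLayer, where CLWidthConcatenateLayer was used before.
// Shapes and data type are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // create the CL context and command queue

    CLTensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(6U, 8U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(10U, 8U), 1, DataType::F32));

    std::vector<ICLTensor *> inputs = { &in0, &in1 };

    CLConcatenateLayer concat;
    concat.configure(inputs, &out, 0); // axis 0 == width (Window::DimX)

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    out.allocator()->allocate();

    concat.run();
    CLScheduler::get().sync();
    return 0;
}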
diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h
deleted file mode 100644
index 9ef21f32d7..0000000000
--- a/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
-#define __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
- *
- * @deprecated This function is deprecated and will be removed in release 19.08
- *
- * -# @ref CLFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
- * -# @ref CLDepthConcatenateLayerKernel
- *
- */
-class CLDepthConcatenateLayer : public IFunction
-{
-public:
- /** Default constructor */
- CLDepthConcatenateLayer();
- /** Initialise the kernel's inputs vector and output.
- *
- * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
- * Input dimensions might differ for each input for the first three dimensions (width, height, depth)
- * and must match for the rest.
- * Note that the difference between the minimum and maximum width and height among the input tensors
- * must be divisible by 2 otherwise it is not clear how padding should be added on the inputs' width and
- * height when they are less than the maximum input sizes.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * Output tensor dimensions match the inputs' ones from the fourth dimension and above,
- * while width and height are the maximum width and height of the input tensors.
- * Finally, depth is the sum of the input depths.
- */
- void configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayer
- *
- * @param[in] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
- * Input dimensions might differ for each input for the first three dimensions (width, height, depth)
- * and must match for the rest.
- * Note that the difference between the minimum and maximum width and height among the input tensors
- * must be divisible by 2 otherwise it is not clear how padding should be added on the inputs' width and
- * height when they are less than the maximum input sizes.
- * @param[in] output Output tensor. Data types supported: Same as @p input.
- * Output tensor dimensions match the inputs' ones from the fourth dimension and above,
- * while width and height are the maximum width and height of the input tensors.
- * Finally, depth is the sum of the input depths.
- *
- * @return a status
- */
- static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::vector<CLDepthConcatenateLayerKernel> _concat_kernels_vector;
- std::vector<CLFillBorderKernel> _border_handlers_vector;
- unsigned int _num_inputs;
-};
-}
-#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index 8bd47cbf8e..3add152878 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -35,10 +35,10 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/common/LSTMParams.h"
@@ -184,7 +184,7 @@ private:
CLActivationLayerKernel _projection_clip;
CLCopyKernel _copy_cell_state;
CLCopyKernel _copy_output;
- CLWidthConcatenateLayer _concat_scratch_buffer;
+ CLConcatenateLayer _concat_scratch_buffer;
CLWidthConcatenate2TensorsKernel _concat_inputs_forget_gate;
CLWidthConcatenate2TensorsKernel _concat_weights_forget_gate;
CLWidthConcatenate2TensorsKernel _concat_weights_input_gate;
diff --git a/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
deleted file mode 100644
index 6a30fcfa92..0000000000
--- a/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLWIDTHCONCATENATELAYER_H__
-#define __ARM_COMPUTE_CLWIDTHCONCATENATELAYER_H__
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
-
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute concatenate tensors along x axis. This function calls the following kernel:
- *
- * @deprecated This function is deprecated and will be removed in release 19.08
- *
- * -# @ref CLWidthConcatenateLayerKernel
- * -# @ref CLWidthConcatenate2TensorsKernel (if there are exactly 2 input tensors)
- * -# @ref CLWidthConcatenate4TensorsKernel (if there are exactly 4 input tensors)
- *
- */
-class CLWidthConcatenateLayer : public IFunction
-{
-public:
- /** Default constructor */
- CLWidthConcatenateLayer();
- /** Initialise the kernel's inputs vector and output.
- *
- * @param[in] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
- * Dimensions of all the inputs should match apart for the width which can differ.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * Output tensor dimensions are the same with the inputs from the second dimension and above.
- * The first dimension (width) is the sum of the input tensors' widths.
- */
- void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
- *
- * @param[in] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
- * Dimensions of all the inputs should match apart for the width which can differ.
- * @param[in] output Output tensor. Data types supported: Same as @p input.
- * Output tensor dimensions are the same with the inputs from the second dimension and above.
- * The first dimension (width) is the sum of the input tensors' widths.
- *
- * @return a status
- */
- static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::vector<CLWidthConcatenateLayerKernel> _concat_kernels_vector;
- CLWidthConcatenate2TensorsKernel _concat_x2_kernel;
- CLWidthConcatenate4TensorsKernel _concat_x4_kernel;
- unsigned int _num_inputs;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLWIDTHCONCATENATELAYER_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
index 7e01480801..67275303c9 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
@@ -31,7 +31,6 @@
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h
deleted file mode 100644
index da00f387e9..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_GCDEPTHCONCATENATE_H__
-#define __ARM_COMPUTE_GCDEPTHCONCATENATE_H__
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
- *
- * @deprecated This function is deprecated and will be removed in release 19.08
- * -# @ref GCFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
- * -# @ref GCDepthConcatenateLayerKernel
- *
- */
-class GCDepthConcatenateLayer : public IFunction
-{
-public:
- /** Default constructor */
- GCDepthConcatenateLayer();
- /** Initialise the kernel's inputs vector and output.
- *
- * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- */
- void configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::vector<std::unique_ptr<GCDepthConcatenateLayerKernel>> _concat_kernels_vector;
- std::vector<std::unique_ptr<GCFillBorderKernel>> _border_handlers_vector;
- unsigned int _num_inputs;
-};
-}
-#endif /* __ARM_COMPUTE_GCDEPTHCONCATENATE_H__ */
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index d84422f882..0d94ea78fc 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -51,7 +51,6 @@
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/runtime/NEON/functions/NECropResize.h"
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
@@ -142,7 +141,6 @@
#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
-#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h"
diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
index f8cda326d2..8c97efc4f0 100644
--- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
@@ -26,8 +26,9 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Requires.h"
#include <memory>
#include <vector>
@@ -41,9 +42,9 @@ class Status;
/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
*
- * -# @ref NEWidthConcatenateLayer (if underlying concatenation axis is 0).
+ * -# @ref NEWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
* -# @ref NEHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
- * -# @ref NEDepthConcatenateLayer (if underlying concatenation axis is 2).
+ * -# @ref NEDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
*/
class NEConcatenateLayer : public IFunction
{
@@ -53,17 +54,18 @@ public:
/** Initialise the kernel's inputs vector and output.
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref NEWidthConcatenateLayer, @ref NEHeightConcatenateLayerKernel and @ref NEDepthConcatenateLayer.
+ * @note Preconditions can be found respectively at @ref NEWidthConcatenateLayerKernel, @ref NEHeightConcatenateLayerKernel and @ref NEDepthConcatenateLayerKernel.
*
* @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
* @param[out] output Output tensor. Data types supported: Same as @p input.
* @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1 and 2.
*/
- void configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, size_t axis);
+ void configure(std::vector<ITensor *> inputs_vector, ITensor *output, size_t axis);
+ void configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis);
/** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref NEWidthConcatenateLayer, @ref NEHeightConcatenateLayerKernel and @ref NEDepthConcatenateLayer.
+ * @note Preconditions can be found respectively at @ref NEWidthConcatenateLayerKernel, @ref NEHeightConcatenateLayerKernel and @ref NEDepthConcatenateLayerKernel.
*
* @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/F16/F32.
* @param[in] output Output tensor info. Data types supported: Same as @p input.
@@ -72,11 +74,19 @@ public:
* @return a status
*/
static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+ static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
// Inherited methods overridden:
void run() override;
private:
+ template <typename TensorType, REQUIRES_TA(std::is_same<typename std::remove_cv<TensorType>::type, ITensor>::value)>
+ void configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output, size_t axis);
+
+ template <typename TensorInfoType, REQUIRES_TA(std::is_same<typename std::remove_cv<TensorInfoType>::type, ITensorInfo>::value)>
+ static Status validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis);
+
+private:
std::vector<std::unique_ptr<INEKernel>> _concat_kernels;
unsigned int _num_inputs;
unsigned int _axis;
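The NEON front end mirrors this: NEConcatenateLayer now exposes both non-const and const input overloads (dispatched through the templated configure_internal/validate_internal helpers added above) and takes the axis as a parameter. A minimal sketch follows, with illustrative shapes and data type.

// Sketch: width concatenation on NEON through the unified NEConcatenateLayer.
// Shapes and data type are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
#include "arm_compute/runtime/Tensor.h"

#include <vector>

using namespace arm_compute;

int main()
{
    Tensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(6U, 8U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(10U, 8U), 1, DataType::F32));

    // Non-const overload; a std::vector<const ITensor *> goes through the
    // second configure() overload introduced in this patch.
    std::vector<ITensor *> inputs = { &in0, &in1 };

    NEConcatenateLayer concat;
    concat.configure(inputs, &out, 0); // axis 0 == width

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    out.allocator()->allocate();

    concat.run();
    return 0;
}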
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h
deleted file mode 100644
index b3bf752b40..0000000000
--- a/arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
-#define __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
- *
- * -# @ref NEFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
- * -# @ref NEDepthConcatenateLayerKernel
- *
- * @deprecated This function is deprecated and will be removed in release 19.08
- *
- */
-class NEDepthConcatenateLayer : public IFunction
-{
-public:
- /** Default constructor */
- NEDepthConcatenateLayer();
- /** Initialise the kernel's inputs vector and output.
- *
- * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
- * Input dimensions might differ for each input for the first three dimensions (width, height, depth)
- * and must match for the rest.
- * Note that the difference between the minimum and maximum width and height among the input tensors
- * must be divisible by 2 otherwise it is not clear how padding should be added on the inputs' width and
- * height when they are less than the maximum input sizes.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * Output tensor dimensions match the inputs' ones from the fourth dimension and above,
- * while width and height are the maximum width and height of the input tensors.
- * Finally, depth is the sum of the input depths.
- */
- void configure(const std::vector<ITensor *> &inputs_vector, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEDepthConcatenateLayer
- *
- * @param[in] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
- * Input dimensions might differ for each input for the first three dimensions (width, height, depth)
- * and must match for the rest.
- * Note that the difference between the minimum and maximum width and height among the input tensors
- * must be divisible by 2 otherwise it is not clear how padding should be added on the inputs' width and
- * height when they are less than the maximum input sizes.
- * @param[in] output Output tensor. Data types supported: Same as @p input.
- * Output tensor dimensions match the inputs' ones from the fourth dimension and above,
- * while width and height are the maximum width and height of the input tensors.
- * Finally, depth is the sum of the input depths.
- *
- * @return a status
- */
- static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::vector<ITensor *> _inputs_vector;
- std::vector<std::unique_ptr<NEDepthConcatenateLayerKernel>> _concat_kernels_vector;
- std::vector<std::unique_ptr<NEFillBorderKernel>> _border_handlers_vector;
- unsigned int _num_inputs;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index f3a1aa7c75..cf0f06c215 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -32,9 +32,9 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
@@ -176,11 +176,11 @@ private:
NEActivationLayerKernel _projection_clip;
NECopyKernel _copy_cell_state;
NECopyKernel _copy_output;
- NEWidthConcatenateLayer _concat_scratch_buffer;
- NEWidthConcatenateLayer _concat_inputs_forget_gate;
- NEWidthConcatenateLayer _concat_weights_forget_gate;
- NEWidthConcatenateLayer _concat_weights_input_gate;
- NEWidthConcatenateLayer _concat_weights_output;
+ NEConcatenateLayer _concat_scratch_buffer;
+ NEConcatenateLayer _concat_inputs_forget_gate;
+ NEConcatenateLayer _concat_weights_forget_gate;
+ NEConcatenateLayer _concat_weights_input_gate;
+ NEConcatenateLayer _concat_weights_output;
Tensor _input_gate_out1;
Tensor _input_gate_out2;
Tensor _input_gate_out3;
diff --git a/arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h
deleted file mode 100644
index 8d221766cd..0000000000
--- a/arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEWIDTHCONCATENATELAYER_H__
-#define __ARM_COMPUTE_NEWIDTHCONCATENATELAYER_H__
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
-
-#include "arm_compute/core/utils/misc/Requires.h"
-
-#include <memory>
-#include <type_traits>
-#include <vector>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to execute concatenate tensors along x axis. This function calls the following kernel:
- *
- * -# @ref NEWidthConcatenateLayerKernel
- *
- * @deprecated This function is deprecated and will be removed in release 19.08
- */
-class NEWidthConcatenateLayer : public IFunction
-{
-public:
- /** Default constructor */
- NEWidthConcatenateLayer();
- /** Initialise the kernel's inputs vector and output.
- *
- * @param[in] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * Dimensions of all the inputs should match apart for the width which can differ.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * Output tensor dimensions are the same with the inputs from the second dimension and above.
- * The first dimension (width) is the sum of the input tensors' widths.
- */
- void configure(std::vector<ITensor *> inputs_vector, ITensor *output);
- void configure(std::vector<const ITensor *> inputs_vector, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEWidthConcatenateLayer
- *
- * @param[in] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * Dimensions of all the inputs should match apart for the width which can differ.
- * @param[in] output Output tensor. Data types supported: Same as @p input.
- * Output tensor dimensions are the same with the inputs from the second dimension and above.
- * The first dimension (width) is the sum of the input tensors' widths.
- *
- * @return a status
- */
- static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output);
- static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::vector<NEWidthConcatenateLayerKernel> _concat_kernels_vector;
- unsigned int _num_inputs;
- template <typename TensorType, REQUIRES_TA(std::is_same<typename std::remove_cv<TensorType>::type, ITensor>::value)>
- void configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output);
- template <typename TensorInfoType, REQUIRES_TA(std::is_same<typename std::remove_cv<TensorInfoType>::type, ITensorInfo>::value)>
- static Status validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEWIDTHCONCATENATELAYER_H__ */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 03e889d14a..caf7ee77bc 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -267,11 +267,11 @@ v19.05 Public major release
- New OpenGLES kernels / functions:
- @ref GCConcatenateLayer
- Deprecated functions/interfaces
- - @ref GCDepthConcatenateLayer
- - @ref NEWidthConcatenateLayer
- - @ref NEDepthConcatenateLayer
- - @ref CLWidthConcatenateLayer
- - @ref CLDepthConcatenateLayer
+ - GCDepthConcatenateLayer
+ - NEWidthConcatenateLayer
+ - NEDepthConcatenateLayer
+ - CLWidthConcatenateLayer
+ - CLDepthConcatenateLayer
- CLGEMMInterleave4x4
- CLGEMMTranspose1xW
- Support different quantization info in CLConcatLayer.
@@ -424,7 +424,7 @@ v18.11 Public major release
- Added documentation for add a new function or kernel.
- Improved doxygen documentation adding a list of the existing functions.
- Add 4D tensors support to
- - @ref CLWidthConcatenateLayer
+ - CLWidthConcatenateLayer
- @ref CLFlattenLayer
- @ref CLSoftmaxLayer
- Add dot product support for @ref CLDepthwiseConvolutionLayer3x3NHWCKernel non-unit stride
@@ -453,7 +453,7 @@ v18.08 Public major release
- Removed support for QS8/QS16 data types.
- Added support for grouped convolution in @ref CLConvolutionLayer.
- Added NHWC data layout support to:
- - @ref NEDepthConcatenateLayer / @ref CLDepthConcatenateLayer
+ - NEDepthConcatenateLayer / CLDepthConcatenateLayer
- @ref NEWinogradConvolutionLayer / @ref CLWinogradConvolutionLayer
- @ref CLDepthwiseConvolutionLayer
- @ref CLDirectConvolutionLayer
@@ -496,7 +496,7 @@ v18.05 Public major release
- @ref CLCopy / @ref CLCopyKernel
- @ref CLLSTMLayer
- @ref CLRNNLayer
- - @ref CLWidthConcatenateLayer / @ref CLWidthConcatenateLayerKernel
+ - CLWidthConcatenateLayer / @ref CLWidthConcatenateLayerKernel
- @ref CLWinogradFilterTransformKernel / @ref CLWinogradInputTransformKernel / @ref CLWinogradConvolutionLayer
- @ref CLWinogradInputTransformKernel / @ref CLWinogradInputTransform
- New Neon kernels / functions:
@@ -619,7 +619,7 @@ v17.12 Public major release
- @ref GCActivationLayerKernel / @ref GCActivationLayer
- @ref GCBatchNormalizationLayerKernel / @ref GCBatchNormalizationLayer
- @ref GCCol2ImKernel
- - @ref GCDepthConcatenateLayerKernel / @ref GCDepthConcatenateLayer
+ - @ref GCDepthConcatenateLayerKernel / GCDepthConcatenateLayer
- @ref GCDirectConvolutionLayerKernel / @ref GCDirectConvolutionLayer
- @ref GCDropoutLayerKernel / @ref GCDropoutLayer
- @ref GCFillBorderKernel / @ref GCFillBorder
@@ -707,7 +707,7 @@ v17.06 Public major release
- User can specify his own scheduler by implementing the @ref IScheduler interface.
- New OpenCL kernels / functions:
- @ref CLBatchNormalizationLayerKernel / @ref CLBatchNormalizationLayer
- - @ref CLDepthConcatenateLayerKernel / @ref CLDepthConcatenateLayer
+ - @ref CLDepthConcatenateLayerKernel / CLDepthConcatenateLayer
- @ref CLHOGOrientationBinningKernel @ref CLHOGBlockNormalizationKernel, @ref CLHOGDetectorKernel / @ref CLHOGDescriptor @ref CLHOGDetector @ref CLHOGGradient @ref CLHOGMultiDetection
- @ref CLLocallyConnectedMatrixMultiplyKernel / @ref CLLocallyConnectedLayer
- @ref CLWeightsReshapeKernel / @ref CLConvolutionLayerReshapeWeights
@@ -715,7 +715,7 @@ v17.06 Public major release
- @ref CPPDetectionWindowNonMaximaSuppressionKernel
- New NEON kernels / functions:
- @ref NEBatchNormalizationLayerKernel / @ref NEBatchNormalizationLayer
- - @ref NEDepthConcatenateLayerKernel / @ref NEDepthConcatenateLayer
+ - @ref NEDepthConcatenateLayerKernel / NEDepthConcatenateLayer
- @ref NEDirectConvolutionLayerKernel / @ref NEDirectConvolutionLayer
- @ref NELocallyConnectedMatrixMultiplyKernel / @ref NELocallyConnectedLayer
- @ref NEWeightsReshapeKernel / @ref NEConvolutionLayerReshapeWeights
diff --git a/docs/05_functions_list.dox b/docs/05_functions_list.dox
index 9a5c8c0027..999b573674 100644
--- a/docs/05_functions_list.dox
+++ b/docs/05_functions_list.dox
@@ -112,7 +112,6 @@ namespace arm_compute
- @ref NEConvolutionSquare &lt;matrix_size&gt;
- @ref NECropResize
- @ref NEDeconvolutionLayer
- - @ref NEDepthConcatenateLayer
- @ref NEDepthwiseConvolutionAssemblyDispatch
- @ref NEDepthwiseConvolutionLayer
- @ref NEDepthwiseConvolutionLayer3x3
@@ -171,7 +170,6 @@ namespace arm_compute
- @ref NEStackLayer
- @ref NEUnstack
- @ref NEUpsampleLayer
- - @ref NEWidthConcatenateLayer
- @ref NEWinogradConvolutionLayer
@section S5_2 OpenCL functions
@@ -188,7 +186,6 @@ namespace arm_compute
- @ref CLCropResize
- @ref CLDeconvolutionLayer
- @ref CLDeconvolutionLayerUpsample
- - @ref CLDepthConcatenateLayer
- @ref CLDepthwiseConvolutionLayer
- @ref CLDepthwiseConvolutionLayer3x3
- @ref CLDepthwiseSeparableConvolutionLayer
@@ -241,7 +238,6 @@ namespace arm_compute
- @ref CLStackLayer
- @ref CLUnstack
- @ref CLUpsampleLayer
- - @ref CLWidthConcatenateLayer
- @ref CLWinogradConvolutionLayer
- @ref ICLSimpleFunction
- @ref CLAbsoluteDifference
@@ -327,7 +323,6 @@ namespace arm_compute
- @ref GCConcatenateLayer
- @ref GCConvolutionLayer
- @ref GCConvolutionLayerReshapeWeights
- - @ref GCDepthConcatenateLayer
- @ref GCDepthwiseConvolutionLayer3x3
- @ref GCDirectConvolutionLayer
- @ref GCDropoutLayer
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index 0de58f5c28..13543dbf15 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,43 +68,6 @@ struct GCEltwiseFunctions
namespace detail
{
-// Specialize functions
-template <>
-std::unique_ptr<IFunction> create_concatenate_layer<GCDepthConcatenateLayer, GCTargetInfo>(ConcatenateLayerNode &node)
-{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
- ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
-
- // Return nullptr if depth concatenate is switched off
- if(!node.is_enabled())
- {
- return nullptr;
- }
-
- // Extract IO and info
- std::vector<GCTargetInfo::TensorType *> inputs;
- for(unsigned int i = 0; i < node.num_inputs(); ++i)
- {
- inputs.push_back(get_backing_tensor<GCTargetInfo>(node.input(i)));
- }
- typename GCTargetInfo::TensorType *output = get_backing_tensor<GCTargetInfo>(node.output(0));
-
- // Create and configure function
- auto func = support::cpp14::make_unique<GCDepthConcatenateLayer>();
- func->configure(inputs, output);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Target " << GCTargetInfo::TargetType
- << " Data Type: " << output->info()->data_type()
- << " Shape: " << output->info()->tensor_shape()
- << " Num Inputs: " << inputs.size()
- << std::endl);
-
- return std::move(func);
-}
-
template <>
std::unique_ptr<IFunction> create_convolution_layer<GCConvolutionLayerFunctions, GCTargetInfo>(ConvolutionLayerNode &node, GraphContext &ctx)
{
@@ -282,7 +245,7 @@ std::unique_ptr<IFunction> GCFunctionFactory::create(INode *node, GraphContext &
case NodeType::ConvolutionLayer:
return detail::create_convolution_layer<GCConvolutionLayerFunctions, GCTargetInfo>(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
case NodeType::ConcatenateLayer:
- return detail::create_concatenate_layer<GCDepthConcatenateLayer, GCTargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
+ return detail::create_concatenate_layer<GCConcatenateLayer, GCTargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
return detail::create_depthwise_convolution_layer<GCDepthwiseConvolutionLayerFunctions, GCTargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
case NodeType::EltwiseLayer:
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index b8224d2cce..0594a17a7a 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -23,11 +23,13 @@
*/
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
-#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
deleted file mode 100644
index f687e54552..0000000000
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
- : _concat_kernels_vector(),
- _border_handlers_vector(),
- _num_inputs(0)
-{
-}
-
-void CLDepthConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output) // NOLINT
-{
- _num_inputs = inputs_vector.size();
-
- std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
- }
-
- _concat_kernels_vector.resize(_num_inputs);
- _border_handlers_vector.resize(_num_inputs);
-
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(CLDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
- unsigned int depth_offset = 0;
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
-
- depth_offset += inputs_vector.at(i)->info()->dimension(2);
- }
-
- // Set valid region from shape
- output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
-}
-
-Status CLDepthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
-
- // Output auto inizialitation if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
- auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
-
- unsigned int depth_offset = 0;
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info));
- depth_offset += input->dimension(2);
- }
-
- return Status{};
-}
-
-void CLDepthConcatenateLayer::run()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- for(unsigned i = 0; i < _num_inputs; i++)
- {
- CLScheduler::get().enqueue(_border_handlers_vector[i], false);
- CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
- }
-}
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 4606a66bf2..85a81a8cd4 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -316,7 +316,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
- _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+ _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX);
input_gate_out->allocator()->allocate();
_cell_state_out1.allocator()->allocate();
forget_gate_out->allocator()->allocate();
@@ -497,7 +497,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
inputs_vector_info_raw.push_back(&forget_gate);
inputs_vector_info_raw.push_back(&output_gate_tmp);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer, Window::DimX));
return Status{};
}
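The same pattern applies on the validation path: where CLLSTMLayer previously validated against CLWidthConcatenateLayer, it now calls the unified CLConcatenateLayer::validate with Window::DimX. Below is a standalone sketch of that check; the tensor infos are illustrative assumptions.

// Sketch: up-front validation of a width concatenation via the unified
// CLConcatenateLayer::validate, as CLLSTMLayer::validate now does.
// Tensor infos are illustrative assumptions.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // some CL kernel validators query the device

    TensorInfo a(TensorShape(4U, 8U), 1, DataType::F32);
    TensorInfo b(TensorShape(6U, 8U), 1, DataType::F32);
    TensorInfo out(TensorShape(10U, 8U), 1, DataType::F32);

    std::vector<ITensorInfo *> infos = { &a, &b };
    const Status status = CLConcatenateLayer::validate(infos, &out, Window::DimX);

    return (status.error_code() == ErrorCode::OK) ? 0 : 1;
}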
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
deleted file mode 100644
index a8667c3138..0000000000
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
- : _concat_kernels_vector(),
- _concat_x2_kernel(),
- _concat_x4_kernel(),
- _num_inputs(0)
-{
-}
-
-Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
-{
- const unsigned int num_inputs = inputs_vector.size();
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
-
- // Output auto inizialitation if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
- auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
-
- switch(num_inputs)
- {
- case 2:
- // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info));
- break;
- case 4:
- // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info));
- break;
- default:
- unsigned int width_offset = 0;
- // Validate generic case of WidthConcatenate kernel
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
- width_offset += input->dimension(0);
- }
- break;
- }
-
- return Status{};
-}
-
-void CLWidthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
-{
- _num_inputs = inputs_vector.size();
-
- std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
- }
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
-
- ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
- switch(_num_inputs)
- {
- case 2:
- // Configure WidthConcatenate2Tensors kernel
- _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output);
- break;
- case 4:
- // Configure WidthConcatenate4Tensors kernel
- _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
- break;
- default:
- // Configure generic case WidthConcatenate kernels
- _concat_kernels_vector.resize(_num_inputs);
-
- unsigned int width_offset = 0;
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
- width_offset += inputs_vector.at(i)->info()->dimension(0);
- }
- break;
- }
-}
-
-void CLWidthConcatenateLayer::run()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- switch(_num_inputs)
- {
- case 2:
- CLScheduler::get().enqueue(_concat_x2_kernel, true);
- break;
- case 4:
- CLScheduler::get().enqueue(_concat_x4_kernel, true);
- break;
- default:
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
- }
- break;
- }
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
deleted file mode 100755
index b89aafa2e5..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-GCDepthConcatenateLayer::GCDepthConcatenateLayer() //NOLINT
- : _concat_kernels_vector(),
- _border_handlers_vector(),
- _num_inputs(0)
-{
-}
-
-void GCDepthConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output) //NOLINT
-{
- ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
-
- _num_inputs = inputs_vector.size();
-
- unsigned int depth_offset = 0;
-
- _concat_kernels_vector.reserve(_num_inputs);
- _border_handlers_vector.reserve(_num_inputs);
-
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- auto concat_kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>();
- auto border_kernel = support::cpp14::make_unique<GCFillBorderKernel>();
-
- concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
- border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
- _concat_kernels_vector.emplace_back(std::move(concat_kernel));
- _border_handlers_vector.emplace_back(std::move(border_kernel));
-
- depth_offset += inputs_vector.at(i)->info()->dimension(2);
- }
-}
-
-void GCDepthConcatenateLayer::run()
-{
- for(unsigned i = 0; i < _num_inputs; i++)
- {
- GCScheduler::get().dispatch(*_border_handlers_vector[i].get(), false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(*_concat_kernels_vector[i].get(), true);
- }
-}
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 71af560fb0..d338493e51 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -23,8 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -44,7 +45,28 @@ NEConcatenateLayer::NEConcatenateLayer()
{
}
-void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, size_t axis)
+void NEConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output, size_t axis)
+{
+ configure_internal(std::move(inputs_vector), output, axis);
+}
+
+void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis)
+{
+ configure_internal(std::move(inputs_vector), output, axis);
+}
+
+Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+{
+ return validate_internal(inputs_vector, output, axis);
+}
+
+Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+{
+ return validate_internal(inputs_vector, output, axis);
+}
+
+template <typename TensorType, typename>
+void NEConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output, size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
_axis = axis;
@@ -97,7 +119,8 @@ void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector,
}
}
-Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+template <typename TensorInfoType, typename>
+Status NEConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
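
The new overload pair above means concatenation of read-only inputs no longer needs a width- or depth-specific function: vectors of ITensor * and const ITensor * both funnel into the same templated configure_internal. A short usage sketch with hypothetical tensor names, assuming the tensors are already allocated:

    #include <vector>

    #include "arm_compute/core/Window.h"
    #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // Hypothetical helper: const-qualified inputs are accepted directly by the new
    // NEConcatenateLayer::configure(std::vector<const ITensor *>, ...) overload.
    void configure_width_concat(const Tensor &a, const Tensor &b, Tensor &dst, NEConcatenateLayer &concat)
    {
        std::vector<const ITensor *> inputs{ &a, &b };
        concat.configure(inputs, &dst, Window::DimX); // concatenate along the width (axis 0)
    }
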
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
deleted file mode 100644
index 8f070a2d7d..0000000000
--- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-NEDepthConcatenateLayer::NEDepthConcatenateLayer() // NOLINT
- : _inputs_vector(),
- _concat_kernels_vector(),
- _border_handlers_vector(),
- _num_inputs(0)
-{
-}
-
-void NEDepthConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output) // NOLINT
-{
- _num_inputs = inputs_vector.size();
-
- std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
- }
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
-
-    // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(NEDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
- unsigned int depth_offset = 0;
- _concat_kernels_vector.reserve(_num_inputs);
- _border_handlers_vector.reserve(_num_inputs);
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto concat_kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
- auto border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
- concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
- border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
- _border_handlers_vector.emplace_back(std::move(border_kernel));
- _concat_kernels_vector.emplace_back(std::move(concat_kernel));
-
- depth_offset += inputs_vector.at(i)->info()->dimension(2);
- }
-
- // Set valid region from shape
- output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
-}
-
-Status NEDepthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
-
-    // Output auto initialization if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
- auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
-
- unsigned int depth_offset = 0;
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info));
- depth_offset += input->dimension(2);
- }
-
- return Status{};
-}
-
-void NEDepthConcatenateLayer::run()
-{
- for(unsigned i = 0; i < _num_inputs; ++i)
- {
- NEScheduler::get().schedule(_border_handlers_vector[i].get(), Window::DimX);
- NEScheduler::get().schedule(_concat_kernels_vector[i].get(), Window::DimX);
- }
-}
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 3d3c6a12fa..42b805794b 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -107,14 +107,14 @@ void NELSTMLayer::configure(const ITensor *input,
inputs_vector.emplace_back(output_state_in);
_memory_group.manage(&_forget_gate_out2);
- _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2);
+ _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2, Window::DimX);
std::vector<const ITensor *> weights_vector;
weights_vector.emplace_back(input_to_forget_weights);
weights_vector.emplace_back(recurrent_to_forget_weights);
- _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6);
+ _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX);
_memory_group.manage(&_forget_gate_out5);
_fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
@@ -165,7 +165,7 @@ void NELSTMLayer::configure(const ITensor *input,
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2);
+ _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2, Window::DimX);
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out4);
@@ -234,7 +234,7 @@ void NELSTMLayer::configure(const ITensor *input,
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- _concat_weights_output.configure(in_out_weights, &_output2);
+ _concat_weights_output.configure(in_out_weights, &_output2, Window::DimX);
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
@@ -308,7 +308,7 @@ void NELSTMLayer::configure(const ITensor *input,
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
- _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+ _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX);
input_gate_out->allocator()->allocate();
_cell_state_out1.allocator()->allocate();
forget_gate_out->allocator()->allocate();
@@ -383,8 +383,9 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- TensorInfo forget_gate_concat;
- ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, &forget_gate_concat));
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
// Validate forget gate
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
@@ -409,8 +410,9 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorInfo lstm_gate_concat;
- ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(lstm_weights, &lstm_gate_concat));
+ TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
if(lstm_params.has_peephole_opt())
@@ -445,8 +447,9 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorInfo in_out_gate_concat;
- ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(in_out_weights, &in_out_gate_concat));
+ TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
@@ -485,7 +488,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
inputs_vector_info_raw.push_back(&forget_gate);
inputs_vector_info_raw.push_back(&output_gate_tmp);
- ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer, Window::DimX));
return Status{};
}
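
The recurring pattern in the validate() changes above: rather than passing an empty TensorInfo and relying on auto-initialization, the destination is sized up front with the shape calculator and handed to NEConcatenateLayer::validate together with the axis. A condensed sketch of that pattern (the helper name is hypothetical):

    #include <vector>

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Window.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"
    #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"

    using namespace arm_compute;

    // Hypothetical helper mirroring the NELSTMLayer::validate changes: pre-size the
    // destination info, then validate a width (axis 0) concatenation.
    // Assumes at least two non-null inputs, as the real validate enforces.
    Status validate_width_concat(const std::vector<const ITensorInfo *> &inputs)
    {
        const TensorShape out_shape = misc::shape_calculator::calculate_concatenate_shape(inputs, 0);
        const TensorInfo  dst_info(out_shape, 1, inputs[0]->data_type());
        return NEConcatenateLayer::validate(inputs, &dst_info, Window::DimX);
    }
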
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
deleted file mode 100644
index 25b5216305..0000000000
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-NEWidthConcatenateLayer::NEWidthConcatenateLayer()
- : _concat_kernels_vector(),
- _num_inputs(0)
-{
-}
-
-template <typename TensorInfoType, typename>
-inline Status NEWidthConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
-
-    // Output auto initialization if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
- auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
-
- unsigned int width_offset = 0;
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
- width_offset += input->dimension(0);
- }
-
- return Status{};
-}
-template <typename TensorType, typename>
-inline void NEWidthConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output)
-{
- _num_inputs = inputs_vector.size();
-
- std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
- }
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
-
-    // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(NEWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
- unsigned int width_offset = 0;
-
- _concat_kernels_vector.resize(_num_inputs);
-
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
- width_offset += inputs_vector.at(i)->info()->dimension(0);
- }
-}
-
-void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
-{
- configure_internal(std::move(inputs_vector), output);
-}
-
-void NEWidthConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output)
-{
- configure_internal(std::move(inputs_vector), output);
-}
-
-Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
-{
- return validate_internal(inputs_vector, output);
-}
-
-Status NEWidthConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output)
-{
- return validate_internal(inputs_vector, output);
-}
-
-void NEWidthConcatenateLayer::run()
-{
- for(unsigned i = 0; i < _num_inputs; ++i)
- {
- NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimY);
- }
-}
diff --git a/tests/benchmark/CL/DepthConcatenateLayer.cpp b/tests/benchmark/CL/DepthConcatenateLayer.cpp
index 3a5c457135..9b101d84ed 100644
--- a/tests/benchmark/CL/DepthConcatenateLayer.cpp
+++ b/tests/benchmark/CL/DepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
#include "tests/CL/CLAccessor.h"
#include "tests/benchmark/fixtures/DepthConcatenateLayerFixture.h"
#include "tests/datasets/ShapeDatasets.h"
@@ -44,7 +44,7 @@ namespace
const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
} // namespace
-using CLDepthConcatenateLayerFixture = DepthConcatenateLayerFixture<CLTensor, ICLTensor, CLDepthConcatenateLayer, CLAccessor>;
+using CLDepthConcatenateLayerFixture = DepthConcatenateLayerFixture<CLTensor, ICLTensor, CLConcatenateLayer, CLAccessor>;
TEST_SUITE(CL)
TEST_SUITE(DepthConcatenateLayer)
diff --git a/tests/benchmark/NEON/DepthConcatenateLayer.cpp b/tests/benchmark/NEON/DepthConcatenateLayer.cpp
index b82da24999..1d8b18c2bb 100644
--- a/tests/benchmark/NEON/DepthConcatenateLayer.cpp
+++ b/tests/benchmark/NEON/DepthConcatenateLayer.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/NEON/Accessor.h"
@@ -44,7 +44,7 @@ namespace
const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
} // namespace
-using NEDepthConcatenateLayerFixture = DepthConcatenateLayerFixture<Tensor, ITensor, NEDepthConcatenateLayer, Accessor>;
+using NEDepthConcatenateLayerFixture = DepthConcatenateLayerFixture<Tensor, ITensor, NEConcatenateLayer, Accessor>;
TEST_SUITE(NEON)
TEST_SUITE(DepthConcatenateLayer)
diff --git a/tests/benchmark/fixtures/DepthConcatenateLayerFixture.h b/tests/benchmark/fixtures/DepthConcatenateLayerFixture.h
index 541dfb285c..272da38952 100644
--- a/tests/benchmark/fixtures/DepthConcatenateLayerFixture.h
+++ b/tests/benchmark/fixtures/DepthConcatenateLayerFixture.h
@@ -86,7 +86,7 @@ public:
TensorShape dst_shape = misc::shape_calculator::calculate_concatenate_shape(src_ptrs, Window::DimZ);
_dst = create_tensor<TensorType>(dst_shape, data_type, 1);
- _depth_concat.configure(src_ptrs, &_dst);
+ _depth_concat.configure(src_ptrs, &_dst, 2);
for(auto &src : _srcs)
{