From 09f24975437e2e141ba51a07055a9372b0d173a2 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 17 May 2019 18:14:40 +0100 Subject: COMPMID-2109: Remove CL/NE Width/Depth ConcatenateLayer functions. Change-Id: Icbda771abffbb45d4ed0958933c60ff9ace01314 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/1178 Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/graph/backends/GLES/GCFunctionsFactory.cpp | 41 +----- src/runtime/CL/functions/CLConcatenateLayer.cpp | 6 +- .../CL/functions/CLDepthConcatenateLayer.cpp | 107 --------------- src/runtime/CL/functions/CLLSTMLayer.cpp | 4 +- .../CL/functions/CLWidthConcatenateLayer.cpp | 143 --------------------- .../functions/GCDepthConcatenateLayer.cpp | 75 ----------- src/runtime/NEON/functions/NEConcatenateLayer.cpp | 31 ++++- .../NEON/functions/NEDepthConcatenateLayer.cpp | 108 ---------------- src/runtime/NEON/functions/NELSTMLayer.cpp | 27 ++-- .../NEON/functions/NEWidthConcatenateLayer.cpp | 117 ----------------- 10 files changed, 50 insertions(+), 609 deletions(-) delete mode 100644 src/runtime/CL/functions/CLDepthConcatenateLayer.cpp delete mode 100644 src/runtime/CL/functions/CLWidthConcatenateLayer.cpp delete mode 100755 src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp delete mode 100644 src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp delete mode 100644 src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp (limited to 'src') diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp index 0de58f5c28..13543dbf15 100644 --- a/src/graph/backends/GLES/GCFunctionsFactory.cpp +++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -68,43 +68,6 @@ struct GCEltwiseFunctions namespace detail { -// Specialize functions -template <> -std::unique_ptr create_concatenate_layer(ConcatenateLayerNode &node) -{ - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl); - ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); - - // Return nullptr if depth concatenate is switched off - if(!node.is_enabled()) - { - return nullptr; - } - - // Extract IO and info - std::vector inputs; - for(unsigned int i = 0; i < node.num_inputs(); ++i) - { - inputs.push_back(get_backing_tensor(node.input(i))); - } - typename GCTargetInfo::TensorType *output = get_backing_tensor(node.output(0)); - - // Create and configure function - auto func = support::cpp14::make_unique(); - func->configure(inputs, output); - - // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Target " << GCTargetInfo::TargetType - << " Data Type: " << output->info()->data_type() - << " Shape: " << output->info()->tensor_shape() - << " Num Inputs: " << inputs.size() - << std::endl); - - return std::move(func); -} - template <> std::unique_ptr create_convolution_layer(ConvolutionLayerNode &node, GraphContext &ctx) { @@ -282,7 +245,7 @@ std::unique_ptr GCFunctionFactory::create(INode *node, GraphContext & case NodeType::ConvolutionLayer: return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); case NodeType::ConcatenateLayer: - return detail::create_concatenate_layer(*polymorphic_downcast(node)); + return detail::create_concatenate_layer(*polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); case NodeType::EltwiseLayer: diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index b8224d2cce..0594a17a7a 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -23,11 +23,13 @@ */ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" +#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h" #include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" +#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" +#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h" -#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp deleted file mode 100644 index f687e54552..0000000000 --- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT - : _concat_kernels_vector(), - _border_handlers_vector(), - _num_inputs(0) -{ -} - -void CLDepthConcatenateLayer::configure(const std::vector &inputs_vector, ICLTensor *output) // NOLINT -{ - _num_inputs = inputs_vector.size(); - - std::vector inputs_vector_info; - for(unsigned int i = 0; i < _num_inputs; i++) - { - inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); - } - - _concat_kernels_vector.resize(_num_inputs); - _border_handlers_vector.resize(_num_inputs); - - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(CLDepthConcatenateLayer::validate(inputs_vector_info, output->info())); - - unsigned int depth_offset = 0; - for(unsigned int i = 0; i < _num_inputs; i++) - { - _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); - _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue()); - - depth_offset += inputs_vector.at(i)->info()->dimension(2); - } - - // Set valid region from shape - output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape)); -} - -Status CLDepthConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2); - - // Output auto inizialitation if not yet initialized - TensorInfo tmp_output_info = *output->clone(); - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ); - auto_init_if_empty(tmp_output_info, output_shape, 1, 
inputs_vector[0]->data_type()); - - unsigned int depth_offset = 0; - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info)); - depth_offset += input->dimension(2); - } - - return Status{}; -} - -void CLDepthConcatenateLayer::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - for(unsigned i = 0; i < _num_inputs; i++) - { - CLScheduler::get().enqueue(_border_handlers_vector[i], false); - CLScheduler::get().enqueue(_concat_kernels_vector[i], true); - } -} diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 4606a66bf2..85a81a8cd4 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -316,7 +316,7 @@ void CLLSTMLayer::configure(const ICLTensor *input, scratch_inputs.emplace_back(&_cell_state_out1); scratch_inputs.emplace_back(forget_gate_out); scratch_inputs.emplace_back(output_gate_out); - _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer); + _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX); input_gate_out->allocator()->allocate(); _cell_state_out1.allocator()->allocate(); forget_gate_out->allocator()->allocate(); @@ -497,7 +497,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, inputs_vector_info_raw.push_back(&forget_gate); inputs_vector_info_raw.push_back(&output_gate_tmp); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer, Window::DimX)); return Status{}; } diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp deleted file mode 100644 index a8667c3138..0000000000 --- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT - : _concat_kernels_vector(), - _concat_x2_kernel(), - _concat_x4_kernel(), - _num_inputs(0) -{ -} - -Status CLWidthConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output) // NOLINT -{ - const unsigned int num_inputs = inputs_vector.size(); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); - - // Output auto inizialitation if not yet initialized - TensorInfo tmp_output_info = *output->clone(); - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX); - auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type()); - - switch(num_inputs) - { - case 2: - // Validate WidthConcatenate2Tensors kernels if there are 2 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info)); - break; - case 4: - // Validate WidthConcatenate4Tensors kernels if there are 4 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info)); - break; - default: - unsigned int width_offset = 0; - // Validate generic case of WidthConcatenate kernel - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info)); - width_offset += input->dimension(0); - } - break; - } - - return Status{}; -} - -void CLWidthConcatenateLayer::configure(std::vector inputs_vector, ICLTensor *output) // NOLINT -{ - _num_inputs = inputs_vector.size(); - - std::vector inputs_vector_info; - for(unsigned int i = 0; i < _num_inputs; i++) - { - inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); - } - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - - ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info())); - - switch(_num_inputs) - { - case 2: - // Configure WidthConcatenate2Tensors kernel - _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output); - break; - case 4: - // Configure WidthConcatenate4Tensors kernel - _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output); - break; - default: - // Configure generic case WidthConcatenate kernels - _concat_kernels_vector.resize(_num_inputs); - - unsigned int width_offset = 0; - for(unsigned int i = 0; i < _num_inputs; ++i) - { - 
_concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output); - width_offset += inputs_vector.at(i)->info()->dimension(0); - } - break; - } -} - -void CLWidthConcatenateLayer::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - switch(_num_inputs) - { - case 2: - CLScheduler::get().enqueue(_concat_x2_kernel, true); - break; - case 4: - CLScheduler::get().enqueue(_concat_x4_kernel, true); - break; - default: - for(unsigned int i = 0; i < _num_inputs; ++i) - { - CLScheduler::get().enqueue(_concat_kernels_vector[i], true); - } - break; - } -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp deleted file mode 100755 index b89aafa2e5..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -GCDepthConcatenateLayer::GCDepthConcatenateLayer() //NOLINT - : _concat_kernels_vector(), - _border_handlers_vector(), - _num_inputs(0) -{ -} - -void GCDepthConcatenateLayer::configure(std::vector inputs_vector, IGCTensor *output) //NOLINT -{ - ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); - - _num_inputs = inputs_vector.size(); - - unsigned int depth_offset = 0; - - _concat_kernels_vector.reserve(_num_inputs); - _border_handlers_vector.reserve(_num_inputs); - - for(unsigned int i = 0; i < _num_inputs; i++) - { - auto concat_kernel = support::cpp14::make_unique(); - auto border_kernel = support::cpp14::make_unique(); - - concat_kernel->configure(inputs_vector.at(i), depth_offset, output); - border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); - _concat_kernels_vector.emplace_back(std::move(concat_kernel)); - _border_handlers_vector.emplace_back(std::move(border_kernel)); - - depth_offset += inputs_vector.at(i)->info()->dimension(2); - } -} - -void GCDepthConcatenateLayer::run() -{ - for(unsigned i = 0; i < _num_inputs; i++) - { - GCScheduler::get().dispatch(*_border_handlers_vector[i].get(), false); - GCScheduler::get().memory_barrier(); - GCScheduler::get().dispatch(*_concat_kernels_vector[i].get(), true); - } -} diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index 71af560fb0..d338493e51 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h" -#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h" +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" @@ -44,7 +45,28 @@ NEConcatenateLayer::NEConcatenateLayer() { } -void NEConcatenateLayer::configure(const std::vector &inputs_vector, ITensor *output, size_t axis) +void NEConcatenateLayer::configure(std::vector inputs_vector, ITensor *output, size_t axis) +{ + configure_internal(std::move(inputs_vector), output, axis); +} + +void NEConcatenateLayer::configure(std::vector inputs_vector, ITensor *output, size_t axis) +{ + configure_internal(std::move(inputs_vector), output, axis); +} + +Status NEConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +{ + return validate_internal(inputs_vector, output, axis); +} + +Status NEConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +{ + return validate_internal(inputs_vector, output, axis); +} + +template +void NEConcatenateLayer::configure_internal(std::vector &&inputs_vector, ITensor *output, size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); _axis = axis; @@ -97,7 +119,8 @@ void 
NEConcatenateLayer::configure(const std::vector &inputs_vector, } } -Status NEConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +template +Status NEConcatenateLayer::validate_internal(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2); diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp deleted file mode 100644 index 8f070a2d7d..0000000000 --- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -NEDepthConcatenateLayer::NEDepthConcatenateLayer() // NOLINT - : _inputs_vector(), - _concat_kernels_vector(), - _border_handlers_vector(), - _num_inputs(0) -{ -} - -void NEDepthConcatenateLayer::configure(const std::vector &inputs_vector, ITensor *output) // NOLINT -{ - _num_inputs = inputs_vector.size(); - - std::vector inputs_vector_info; - for(unsigned int i = 0; i < _num_inputs; i++) - { - inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); - } - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthConcatenateLayer::validate(inputs_vector_info, output->info())); - - unsigned int depth_offset = 0; - _concat_kernels_vector.reserve(_num_inputs); - _border_handlers_vector.reserve(_num_inputs); - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto concat_kernel = support::cpp14::make_unique(); - auto border_kernel = support::cpp14::make_unique(); - concat_kernel->configure(inputs_vector.at(i), depth_offset, output); - border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); - _border_handlers_vector.emplace_back(std::move(border_kernel)); - _concat_kernels_vector.emplace_back(std::move(concat_kernel)); - - depth_offset += inputs_vector.at(i)->info()->dimension(2); - } - - // Set valid region from shape - output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape)); -} - -Status NEDepthConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2); - - // Output auto inizialitation if not yet initialized - TensorInfo tmp_output_info = *output->clone(); - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ); - auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type()); - - unsigned int depth_offset = 0; - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info)); - depth_offset += input->dimension(2); - } - - return Status{}; -} - -void NEDepthConcatenateLayer::run() -{ - for(unsigned i = 0; i < _num_inputs; ++i) - { - NEScheduler::get().schedule(_border_handlers_vector[i].get(), Window::DimX); - NEScheduler::get().schedule(_concat_kernels_vector[i].get(), Window::DimX); - } -} diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index 3d3c6a12fa..42b805794b 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -107,14 +107,14 @@ void NELSTMLayer::configure(const ITensor *input, 
inputs_vector.emplace_back(output_state_in); _memory_group.manage(&_forget_gate_out2); - _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2); + _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2, Window::DimX); std::vector weights_vector; weights_vector.emplace_back(input_to_forget_weights); weights_vector.emplace_back(recurrent_to_forget_weights); - _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6); + _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5); @@ -165,7 +165,7 @@ void NELSTMLayer::configure(const ITensor *input, lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2); + _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2, Window::DimX); _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out4); @@ -234,7 +234,7 @@ void NELSTMLayer::configure(const ITensor *input, in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - _concat_weights_output.configure(in_out_weights, &_output2); + _concat_weights_output.configure(in_out_weights, &_output2, Window::DimX); _memory_group.manage(&_output1); _memory_group.manage(&_output4); @@ -308,7 +308,7 @@ void NELSTMLayer::configure(const ITensor *input, scratch_inputs.emplace_back(&_cell_state_out1); scratch_inputs.emplace_back(forget_gate_out); scratch_inputs.emplace_back(output_gate_out); - _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer); + _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX); input_gate_out->allocator()->allocate(); _cell_state_out1.allocator()->allocate(); forget_gate_out->allocator()->allocate(); @@ -383,8 +383,9 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - TensorInfo forget_gate_concat; - ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, &forget_gate_concat)); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); // Validate forget gate ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate)); @@ -409,8 +410,9 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorInfo lstm_gate_concat; - ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(lstm_weights, &lstm_gate_concat)); + TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); 
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate)); if(lstm_params.has_peephole_opt()) @@ -445,8 +447,9 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorInfo in_out_gate_concat; - ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(in_out_weights, &in_out_gate_concat)); + TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp)); @@ -485,7 +488,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, inputs_vector_info_raw.push_back(&forget_gate); inputs_vector_info_raw.push_back(&output_gate_tmp); - ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer)); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer, Window::DimX)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp deleted file mode 100644 index 25b5216305..0000000000 --- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/Tensor.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -NEWidthConcatenateLayer::NEWidthConcatenateLayer() - : _concat_kernels_vector(), - _num_inputs(0) -{ -} - -template -inline Status NEWidthConcatenateLayer::validate_internal(const std::vector &inputs_vector, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2); - - // Output auto inizialitation if not yet initialized - TensorInfo tmp_output_info = *output->clone(); - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX); - auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type()); - - unsigned int width_offset = 0; - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info)); - width_offset += input->dimension(0); - } - - return Status{}; -} -template -inline void NEWidthConcatenateLayer::configure_internal(std::vector &&inputs_vector, ITensor *output) -{ - _num_inputs = inputs_vector.size(); - - std::vector inputs_vector_info; - for(unsigned int i = 0; i < _num_inputs; ++i) - { - inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); - } - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(NEWidthConcatenateLayer::validate(inputs_vector_info, output->info())); - - unsigned int width_offset = 0; - - _concat_kernels_vector.resize(_num_inputs); - - for(unsigned int i = 0; i < _num_inputs; ++i) - { - _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output); - width_offset += inputs_vector.at(i)->info()->dimension(0); - } -} - -void NEWidthConcatenateLayer::configure(std::vector inputs_vector, ITensor *output) -{ - configure_internal(std::move(inputs_vector), output); -} - -void NEWidthConcatenateLayer::configure(std::vector inputs_vector, ITensor *output) -{ - configure_internal(std::move(inputs_vector), output); -} - -Status NEWidthConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output) -{ - return validate_internal(inputs_vector, output); -} - -Status NEWidthConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output) -{ - return validate_internal(inputs_vector, output); -} - -void NEWidthConcatenateLayer::run() -{ - for(unsigned i = 0; i < _num_inputs; ++i) - { - NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimY); - } -} -- cgit v1.2.1
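
Migration note for downstream users: with CLWidthConcatenateLayer, CLDepthConcatenateLayer, NEWidthConcatenateLayer and NEDepthConcatenateLayer removed, call sites switch to the generic CLConcatenateLayer/NEConcatenateLayer and pass the concatenation axis explicitly, exactly as the LSTM changes above do. A minimal NEON sketch follows; it assumes the tensors are already allocated and initialised, and the helper function and tensor names are illustrative only, not taken from this patch.

    #include <vector>

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Window.h"
    #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"

    using namespace arm_compute;

    // Illustrative helper: concatenate two already-allocated tensors along width and depth.
    void concat_after_compmid_2109(ITensor *a, ITensor *b, ITensor *out_width, ITensor *out_depth)
    {
        std::vector<ITensor *> inputs = { a, b };

        // Was: NEWidthConcatenateLayer::configure(inputs, out_width);
        // Now: the axis is passed explicitly; Window::DimX (= 0) is the width dimension.
        NEConcatenateLayer width_concat;
        width_concat.configure(inputs, out_width, Window::DimX);
        width_concat.run();

        // Was: NEDepthConcatenateLayer::configure(inputs, out_depth);
        // Now: Window::DimZ (= 2) selects the depth/channel dimension.
        NEConcatenateLayer depth_concat;
        depth_concat.configure(inputs, out_depth, Window::DimZ);
        depth_concat.run();
    }

The CL path is analogous: CLConcatenateLayer::configure(inputs, output, Window::DimX), as used for the LSTM scratch-buffer concatenation in this patch.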