From 338595bca8ab60492f10626860acb1ab3722b1ce Mon Sep 17 00:00:00 2001
From: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Date: Fri, 28 Jun 2019 14:09:53 +0100
Subject: COMPMID-2234 : Add support for axis 3 in NE/CLConcatenateLayer

Change-Id: Ic86f89ece3afe72809bc69c6de6fee7d21daa1d4
Signed-off-by: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1440
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/CL/CLKernels.h                    |   1 +
 .../CL/kernels/CLBatchConcatenateLayerKernel.h     |  83 ++++++++++
 arm_compute/core/NEON/NEKernels.h                  |   1 +
 .../NEON/kernels/NEBatchConcatenateLayerKernel.h   |  90 ++++++++++
 .../runtime/CL/functions/CLConcatenateLayer.h      |   5 +-
 .../runtime/NEON/functions/NEConcatenateLayer.h    |   5 +-
 src/core/CL/CLKernelLibrary.cpp                    |   2 +-
 src/core/CL/cl_kernels/concatenate.cl              |   2 +-
 .../CL/kernels/CLBatchConcatenateLayerKernel.cpp   | 168 +++++++++++++++++++
 .../CL/kernels/CLDepthConcatenateLayerKernel.cpp   |   2 +-
 .../NEON/kernels/NEBatchConcatenateLayerKernel.cpp | 181 +++++++++++++++++++++
 src/runtime/CL/functions/CLConcatenateLayer.cpp    |  21 +++
 src/runtime/NEON/functions/NEConcatenateLayer.cpp  |  13 ++
 tests/validation/CL/BatchConcatenateLayer.cpp      | 170 +++++++++++++++++++
 tests/validation/NEON/BatchConcatenateLayer.cpp    | 154 ++++++++++++++++++
 tests/validation/reference/ConcatenateLayer.cpp    |  10 ++
 16 files changed, 901 insertions(+), 7 deletions(-)
 create mode 100644 arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
 create mode 100644 arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
 create mode 100644 src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
 create mode 100644 src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
 create mode 100644 tests/validation/CL/BatchConcatenateLayer.cpp
 create mode 100644 tests/validation/NEON/BatchConcatenateLayer.cpp

diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index cd5612c9ae..8fbc4770b0 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
 #include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
diff --git a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
new file mode 100644
index 0000000000..69571ad499
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the batch concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class CLBatchConcatenateLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLBatchConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchConcatenateLayerKernel(const CLBatchConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchConcatenateLayerKernel &operator=(const CLBatchConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLBatchConcatenateLayerKernel(CLBatchConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLBatchConcatenateLayerKernel &operator=(CLBatchConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLBatchConcatenateLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in]     batch_offset The offset on axis # 3.
+     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
+     *
+     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
+     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const ICLTensor *input, unsigned int batch_offset, ICLTensor *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLBatchConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in] batch_offset The offset on axis # 3.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    unsigned int     _batch_offset;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 4023d82107..e41f299611 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -30,6 +30,7 @@
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
diff --git a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
new file mode 100644
index 0000000000..edd9470a3c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the batch concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class NEBatchConcatenateLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBatchConcatenateLayerKernel";
+    }
+    /** Default constructor */
+    NEBatchConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchConcatenateLayerKernel(const NEBatchConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchConcatenateLayerKernel &operator=(const NEBatchConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBatchConcatenateLayerKernel(NEBatchConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBatchConcatenateLayerKernel &operator=(NEBatchConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEBatchConcatenateLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in]     batch_offset The offset on axis # 3.
+     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
+     *
+     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
+     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const ITensor *input, unsigned int batch_offset, ITensor *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref NEBatchConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in] batch_offset The offset on axis # 3.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    using BatchConcatFunction = void(const ITensor *in, ITensor *out, int batch_offset, const Window &window);
+
+private:
+    BatchConcatFunction *_func;
+    const ITensor       *_input;
+    ITensor             *_output;
+    unsigned int         _batch_offset;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index c56fc117b9..b69930c7d3 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -44,6 +44,7 @@ class Status;
  * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
  * -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
  * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
+ * -# @ref CLBatchConcatenateLayerKernel (if underlying concatenation axis is 3).
  */
 class CLConcatenateLayer : public IFunction
 {
@@ -57,7 +58,7 @@ public:
      *
      * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
      * @param[out]    output        Output tensor. Data types supported: Same as @p input.
-     * @param[in]     axis          Concatenation axis. Supported underlying concatenation axis are 0, 1 and 2.
+     * @param[in]     axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      */
     void configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
     /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
@@ -67,7 +68,7 @@ public:
      *
      * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/F16/F32.
      * @param[in] output        Output tensor info. Data types supported: Same as @p input.
-     * @param[in] axis          Concatenation axis. Supported underlying concatenation axis are 0, 1 and 2.
+     * @param[in] axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      *
      * @return a status
      */
diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
index 8c97efc4f0..953e3fa641 100644
--- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
@@ -45,6 +45,7 @@ class Status;
  * -# @ref NEWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
  * -# @ref NEHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
  * -# @ref NEDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
+ * -# @ref NEBatchConcatenateLayerKernel (if underlying concatenation axis is 3).
  */
 class NEConcatenateLayer : public IFunction
 {
@@ -58,7 +59,7 @@ public:
      *
      * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
      * @param[out]    output        Output tensor. Data types supported: Same as @p input.
-     * @param[in]     axis          Concatenation axis. Supported underlying concatenation axis are 0, 1 and 2.
+     * @param[in]     axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      */
     void configure(std::vector<ITensor *> inputs_vector, ITensor *output, size_t axis);
     void configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis);
@@ -69,7 +70,7 @@ public:
      *
      * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/F16/F32.
      * @param[in] output        Output tensor info. Data types supported: Same as @p input.
-     * @param[in] axis          Concatenation axis. Supported underlying concatenation axis are 0, 1 and 2.
+     * @param[in] axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      *
      * @return a status
      */
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index c0875bebcd..db57bb93a6 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -188,7 +188,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "compare_less_quantized", "comparisons.cl" },
     { "compare_lessequal", "comparisons.cl" },
     { "compare_lessequal_quantized", "comparisons.cl" },
-    { "concatenate_depth", "concatenate.cl" },
+    { "concatenate", "concatenate.cl" },
     { "concatenate_width", "concatenate.cl" },
     { "concatenate_height", "concatenate.cl" },
     { "concatenate_width_x2", "concatenate.cl" },
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index e365683958..5ccf746a4e 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -406,7 +406,7 @@ __kernel void concatenate_height(
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in]  offsets                           The offsets to the first valid element of the output tensor in bytes
  */
-__kernel void concatenate_depth(
+__kernel void concatenate(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     int offset)
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
new file mode 100644
index 0000000000..86bf366346
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <map>
+
+using namespace arm_compute;
+
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(batch_offset);
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    // The window needs to be based on output, except for the batch size
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    // The total batch size is the concatenation of the batch size of the inputs
+    win.set(3, Window::Dimension(0, input->tensor_shape()[3], 1));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+Status validate_arguments(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimZ) != output->dimension(Window::DimZ));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) + batch_offset > output->dimension(3));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, input, output);
+
+    return Status{};
+}
+} // namespace
+
+CLBatchConcatenateLayerKernel::CLBatchConcatenateLayerKernel()
+    : _input(nullptr), _output(nullptr), _batch_offset(0)
+{
+}
+
+void CLBatchConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int batch_offset, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), batch_offset, output->info()));
+
+    _input        = input;
+    _output       = output;
+    _batch_offset = batch_offset;
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    {
+        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), batch_offset, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    ICLKernel::configure_internal(std::get<1>(win_config));
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_";
+    _config_id += support::cpp11::to_string(3);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(batch_offset);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(3));
+}
+
+Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input,
+                                               unsigned int                    batch_offset,
+                                               const arm_compute::ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, batch_offset, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), batch_offset, output->clone().get()).first);
+    return Status{};
+}
+
+void CLBatchConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    const int offset_to_first_elements_in_bytes = _batch_offset * _output->info()->strides_in_bytes()[3];
+
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 5e1bbe944f..40b633b273 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -109,7 +109,7 @@ void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned i
     }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts.options()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate", build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), depth_offset, output->info());
diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
new file mode 100644
index 0000000000..4263892c50
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T>
+void batch_concat(const ITensor *in, ITensor *out, int batch_offset, const Window &window)
+{
+    // Offset input
+    uint8_t *input_ptr = in->buffer() + in->info()->offset_first_element_in_bytes();
+
+    // Offset output
+    uint8_t *output_ptr = out->buffer() + out->info()->offset_first_element_in_bytes() + batch_offset * out->info()->strides_in_bytes()[3];
+
+    Iterator input(in, window);
+    Iterator output(out, window);
+
+    const DataType                dt           = in->info()->data_type();
+    const UniformQuantizationInfo input_qinfo  = in->info()->quantization_info().uniform();
+    const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
+    if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
+    {
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            const auto in_ptr  = reinterpret_cast<const uint8_t *>(input_ptr + input.offset());
+            const auto out_ptr = reinterpret_cast<uint8_t *>(output_ptr + output.offset());
+            vst1q_u8(out_ptr, vquantize(vdequantize(vld1q_u8(in_ptr), input_qinfo), output_qinfo));
+        },
+        input, output);
+    }
+    else
+    {
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            const auto in_ptr  = reinterpret_cast<const T *>(input_ptr + input.offset());
+            const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
+
+            wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
+        },
+        input, output);
+    }
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(batch_offset);
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    // The window needs to be based on input as we copy all the batchs of input
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    win.set(3, Window::Dimension(0, input->tensor_shape()[3], 1));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+Status validate_arguments(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimZ) != output->dimension(Window::DimZ));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) + batch_offset > output->dimension(3));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, input, output);
+
+    return Status{};
+}
+} // namespace
+
+NEBatchConcatenateLayerKernel::NEBatchConcatenateLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _batch_offset(0)
+{
+}
+
+void NEBatchConcatenateLayerKernel::configure(const ITensor *input, unsigned int batch_offset, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), batch_offset, output->info()));
+
+    _func         = nullptr;
+    _input        = input;
+    _output       = output;
+    _batch_offset = batch_offset;
+
+    switch(input->info()->data_type())
+    {
+        case DataType::S8:
+        case DataType::U8:
+        case DataType::QASYMM8:
+            _func = &batch_concat<uint8_t>;
+            break;
+        case DataType::S16:
+        case DataType::U16:
+        case DataType::F16:
+            _func = &batch_concat<uint16_t>;
+            break;
+        case DataType::S32:
+        case DataType::U32:
+        case DataType::F32:
+            _func = &batch_concat<uint32_t>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), batch_offset, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input,
+                                               unsigned int                    batch_offset,
+                                               const arm_compute::ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, batch_offset, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), batch_offset, output->clone().get()).first);
+    return Status{};
+}
+
+void NEBatchConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (*_func)(_input, _output, _batch_offset, window);
+}
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 0594a17a7a..1d396f5ebf 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 
+#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
@@ -124,6 +125,17 @@ void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector
             }
             break;
         }
+        case 3:
+        {
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = support::cpp14::make_unique<CLBatchConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Axis not supported");
     }
@@ -184,6 +196,15 @@ Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vec
             }
             break;
         }
+        case 3:
+        {
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(CLBatchConcatenateLayerKernel::validate(input, offset, output));
+                offset += input->dimension(axis);
+            }
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Axis not supported");
     }
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index d338493e51..9a70d32843 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
 
+#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
@@ -112,6 +113,13 @@ void NEConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_v
                 _concat_kernels.emplace_back(std::move(kernel));
                 break;
             }
+            case 3:
+            {
+                auto kernel = support::cpp14::make_unique<NEBatchConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                _concat_kernels.emplace_back(std::move(kernel));
+                break;
+            }
             default:
                 ARM_COMPUTE_ERROR("Axis not supported");
         }
@@ -146,6 +154,11 @@ Status NEConcatenateLayer::validate_internal(const std::vector<TensorInfoType *>
                 ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, offset, output));
                 break;
             }
+            case 3:
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(NEBatchConcatenateLayerKernel::validate(input, offset, output));
+                break;
+            }
             default:
                 ARM_COMPUTE_ERROR("Axis not supported");
         }
diff --git a/tests/validation/CL/BatchConcatenateLayer.cpp b/tests/validation/CL/BatchConcatenateLayer.cpp
new file mode 100644
index 0000000000..b789569155
--- /dev/null
+++ b/tests/validation/CL/BatchConcatenateLayer.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ConcatenateLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(BatchConcatenateLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+        framework::dataset::make("InputInfo1", {  TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Mismatching data type input/output
+                                                  TensorInfo(TensorShape(20U, 27U, 4U, 4U), 1, DataType::F32), // Mismatching x dimension
+                                                  TensorInfo(TensorShape(23U, 26U, 4U, 3U), 1, DataType::F32), // Mismatching y dim
+                                                  TensorInfo(TensorShape(23U, 27U, 4U, 3U), 1, DataType::F32), // Mismatching z dim
+                                                  TensorInfo(TensorShape(16U, 27U, 3U, 6U), 1, DataType::F32)
+        }),
+        framework::dataset::make("InputInfo2", {  TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 27U, 4U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 27U, 4U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 27U, 3U, 3U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(16U, 27U, 3U, 6U), 1, DataType::F32)
+        })),
+        framework::dataset::make("OutputInfo", {  TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F16),
+                                                  TensorInfo(TensorShape(23U, 12U, 4U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 27U, 4U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 20U, 4U, 3U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(16U, 27U, 3U, 12U), 1, DataType::F32)
+        })),
+        framework::dataset::make("Expected", { false, false, false, false, true })),
+        input_info1, input_info2, output_info,expected)
+{
+    std::vector<TensorInfo> inputs_vector_info;
+    inputs_vector_info.emplace_back(std::move(input_info1));
+    inputs_vector_info.emplace_back(std::move(input_info2));
+
+    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    inputs_vector_info_raw.reserve(inputs_vector_info.size());
+    for(auto &input : inputs_vector_info)
+    {
+        inputs_vector_info_raw.emplace_back(&input);
+    }
+
+    bool is_valid = bool(CLConcatenateLayer::validate(inputs_vector_info_raw, &output_info.clone()->set_is_resizable(false), 3));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+TEST_CASE(Configuration, framework::DatasetMode::ALL)
+{
+    // Create tensors
+    CLTensor src1 = create_tensor<CLTensor>(TensorShape(128U, 32U, 32U), DataType::F32, 1);
+    CLTensor src2 = create_tensor<CLTensor>(TensorShape(128U, 32U, 32U), DataType::F32, 1);
+    CLTensor src3 = create_tensor<CLTensor>(TensorShape(128U, 32U, 32U), DataType::F32, 1);
+    CLTensor dst;
+
+    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(src3.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLConcatenateLayer concat_layer;
+
+    concat_layer.configure({ &src1, &src2, &src3 }, &dst, 3);
+}
+template <typename T>
+using CLBatchConcatenateLayerFixture = ConcatenateLayerValidationFixture<CLTensor, ICLTensor, CLAccessor, CLConcatenateLayer, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLBatchConcatenateLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(concat(datasets::Small3DShapes(), datasets::Tiny4DShapes()),
+                                                                                                                  framework::dataset::make("DataType",
+                                                                                                                          DataType::F16)),
+                                                                                                                  framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLBatchConcatenateLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::Large3DShapes(), datasets::Small4DShapes()),
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F16)),
+                                                                                                                framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLBatchConcatenateLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(concat(datasets::Small3DShapes(), datasets::Tiny4DShapes()),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::F32)),
+                                                                                                                   framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLBatchConcatenateLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::ConcatenateLayerShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)),
+                                                                                                                 framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLBatchConcatenateLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(concat(datasets::Small3DShapes(), datasets::Tiny4DShapes()),
+                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                             DataType::QASYMM8)),
+                                                                                                                     framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLBatchConcatenateLayerFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::ConcatenateLayerShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::QASYMM8)),
+                                                                                                                   framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/BatchConcatenateLayer.cpp b/tests/validation/NEON/BatchConcatenateLayer.cpp
new file mode 100644
index 0000000000..f95663dbd3
--- /dev/null
+++ b/tests/validation/NEON/BatchConcatenateLayer.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ConcatenateLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(BatchConcatenateLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+        framework::dataset::make("InputInfo1", {  TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Mismatching data type input/output
+                                                  TensorInfo(TensorShape(20U, 27U, 4U, 4U), 1, DataType::F32), // Mismatching x dimension
+                                                  TensorInfo(TensorShape(23U, 26U, 4U, 3U), 1, DataType::F32), // Mismatching y dim
+                                                  TensorInfo(TensorShape(23U, 27U, 4U, 3U), 1, DataType::F32), // Mismatching z dim
+                                                  TensorInfo(TensorShape(16U, 27U, 3U, 6U), 1, DataType::F32)
+        }),
+        framework::dataset::make("InputInfo2", {  TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 27U, 4U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 27U, 4U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 27U, 3U, 3U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(16U, 27U, 3U, 6U), 1, DataType::F32)
+        })),
+        framework::dataset::make("OutputInfo", {  TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F16),
+                                                  TensorInfo(TensorShape(23U, 12U, 4U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 27U, 4U, 4U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(23U, 20U, 4U, 3U), 1, DataType::F32),
+                                                  TensorInfo(TensorShape(16U, 27U, 3U, 12U), 1, DataType::F32)
+        })),
+        framework::dataset::make("Expected", { false, false, false, false, true })),
+        input_info1, input_info2, output_info,expected)
+{
+    std::vector<TensorInfo> inputs_vector_info;
+    inputs_vector_info.emplace_back(std::move(input_info1));
+    inputs_vector_info.emplace_back(std::move(input_info2));
+
+    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    inputs_vector_info_raw.reserve(inputs_vector_info.size());
+    for(auto &input : inputs_vector_info)
+    {
+        inputs_vector_info_raw.emplace_back(&input);
+    }
+
+    bool is_valid = bool(NEConcatenateLayer::validate(inputs_vector_info_raw, &output_info.clone()->set_is_resizable(false), 3));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEBatchConcatenateLayerFixture = ConcatenateLayerValidationFixture<Tensor, ITensor, Accessor, NEConcatenateLayer, T>;
+
+TEST_SUITE(Float)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchConcatenateLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(concat(datasets::Small2DShapes(), datasets::Tiny4DShapes()),
+                                                                                                                  framework::dataset::make("DataType",
+                                                                                                                          DataType::F16)),
+                                                                                                                  framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchConcatenateLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::ConcatenateLayerShapes(), framework::dataset::make("DataType",
+                                                                                                                        DataType::F16)),
+                                                                                                                framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchConcatenateLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(concat(datasets::Small3DShapes(), datasets::Tiny4DShapes()),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::F32)),
+                                                                                                                   framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchConcatenateLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::ConcatenateLayerShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)),
+                                                                                                                 framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchConcatenateLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(concat(datasets::Small3DShapes(), datasets::Tiny4DShapes()),
+                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                             DataType::QASYMM8)),
+                                                                                                                     framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchConcatenateLayerFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::ConcatenateLayerShapes(),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::QASYMM8)),
+                                                                                                                   framework::dataset::make("Axis", 3)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ConcatenateLayer.cpp b/tests/validation/reference/ConcatenateLayer.cpp
index 6c90d74a0f..aa74ca2474 100644
--- a/tests/validation/reference/ConcatenateLayer.cpp
+++ b/tests/validation/reference/ConcatenateLayer.cpp
@@ -127,6 +127,16 @@ SimpleTensor<T> concatenate_layer(std::vector<SimpleTensor<T>> &srcs, SimpleTens
             dst = reference::permute<T>(dst, PermutationVector(2U, 1U, 0U));
             return reference::permute<T>(widthconcatenate_layer(srcs, dst), PermutationVector(2U, 1U, 0U));
         }
+        case 3:
+        {
+            for(auto &t : srcs)
+            {
+                t = reference::permute<T>(t, PermutationVector(3U, 2U, 1U, 0U));
+            }
+            dst      = reference::permute<T>(dst, PermutationVector(3U, 2U, 1U, 0U));
+            auto ret = reference::permute<T>(widthconcatenate_layer(srcs, dst), PermutationVector(3U, 2U, 1U, 0U));
+            return ret;
+        }
         default:
         {
             ARM_COMPUTE_ERROR("Not supported");
-- 
cgit v1.2.1