From 0b72aa4b2abdba7ab48aaa8a45c624ba1e27a411 Mon Sep 17 00:00:00 2001
From: Gunes Bayir <gunes.bayir@arm.com>
Date: Sat, 7 Oct 2023 23:52:48 +0100
Subject: Optimize NEStackLayer

Optimize the stack operation in Cpu by leveraging block memcpy.

Resolves: COMPMID-6498

Change-Id: I49d79d179f0375a73d654edd59fb33072112569b
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10451
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/runtime/NEON/functions/NEStackLayer.h  |  13 +-
 docs/user_guide/release_version_and_change_log.dox |   1 +
 src/core/NEON/kernels/NEStackLayerKernel.cpp       | 196 +++++++++++++------
 src/core/NEON/kernels/NEStackLayerKernel.h         |  62 +++---
 src/runtime/NEON/functions/NEStackLayer.cpp        |  31 +--
 tests/validation/NEON/StackLayer.cpp               | 211 ++++++++++++---------
 tests/validation/fixtures/StackLayerFixture.h      |  34 +++-
 7 files changed, 335 insertions(+), 213 deletions(-)
diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h
index ae4e468f21..98dacde0c1 100644
--- a/arm_compute/runtime/NEON/functions/NEStackLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NESTACKLAYER_H
-#define ARM_COMPUTE_NESTACKLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -91,9 +91,8 @@ public:
     void run() override;
 
 private:
-    std::vector<ITensor *>                           _input;
-    std::vector<std::unique_ptr<NEStackLayerKernel>> _stack_kernels;
-    unsigned int                                     _num_inputs;
+    std::unique_ptr<NEStackLayerKernel> _stack_kernel;
+    bool                                _is_prepared;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 882244d2f2..d1429b61d7 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -55,6 +55,7 @@ v23.11 Public major release
  - Performance optimizations:
    - Optimize @ref cpu::CpuReshape
    - Optimize @ref opencl::ClTranspose
+   - Optimize @ref NEStackLayer
  - Add new OpenCL™ kernels:
    - @ref opencl::kernels::ClMatMulLowpNativeMMULKernel support for QASYMM8 and QASYMM8_SIGNED, with batch support
  - Deprecate support for Bfloat16 in @ref cpu::CpuCast.
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index e23b40a9aa..225e4fcfd2 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,7 @@
 #include "arm_compute/core/Window.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
 #include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
@@ -42,9 +43,10 @@ using namespace arm_compute::misc::shape_calculator;
 namespace
 {
 Status validate_arguments(const ITensorInfo *input,
-                          unsigned int       axis,
-                          unsigned int       idx_input,
-                          unsigned int       num_tensors,
+                          uint32_t           axis,
+                          uint32_t           idx_input,
+                          uint32_t           num_tensors,
+                          uint32_t           rank,
                           const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -53,6 +55,7 @@ Status validate_arguments(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
     ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != rank);
 
     if (output->total_size() != 0)
     {
@@ -65,93 +68,162 @@ Status validate_arguments(const ITensorInfo *input,
     return Status{};
 }
 
-std::pair<Status, Window>
-validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+inline Coordinates
+shift_from_axis_and_replace_coordinate(const Coordinates &id, uint32_t axis, uint32_t idx_input, uint32_t num_dims)
 {
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
+    Coordinates id_out = id;
+    for (uint32_t i = num_dims; i > axis; --i)
+    {
+        id_out.set(i, id[i - 1]);
+    }
+    id_out.set(axis, idx_input);
+    return id_out;
+}
 
-    // Configure kernel window
-    Window win = calculate_max_window(*input);
+void elementwise_stack(const std::vector<ITensor *> &input, ITensor *output, uint32_t axis, const Window &window)
+{
+    Window window_out;
+    window_out.use_tensor_dimensions(output->info()->tensor_shape());
+
+    const int32_t  num_tensors  = input.size();
+    const size_t   element_size = input[0]->info()->element_size();
+    const uint32_t num_dims     = static_cast<uint32_t>(input[0]->info()->num_dimensions());
 
-    return std::make_pair(Status{}, win);
+    for (int32_t idx_input = 0; idx_input < num_tensors; ++idx_input)
+    {
+        Iterator input_it(input[idx_input], window);
+
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                Coordinates id_out = shift_from_axis_and_replace_coordinate(id, axis, idx_input, num_dims);
+                std::memcpy(output->ptr_to_element(id_out), input_it.ptr(), element_size);
+            },
+            input_it);
+    }
 }
 
-inline Coordinates
-shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+void memcpy_stack(const std::vector<ITensor *> &input, ITensor *output, uint32_t axis, const Window &window)
 {
-    constexpr int max_out_coord = 5; // Input shape is max a 4D shape, output is max 5D
-    Coordinates   id_out        = id;
-    for (unsigned int i = max_out_coord - 1; i > axis; --i)
+    const int32_t element_size   = input[0]->info()->element_size();
+    const int32_t chunk_size     = input[0]->info()->tensor_shape().total_size_lower(axis) * element_size;
+    const int32_t num_tensors    = input.size();
+    const int32_t out_chunk_step = chunk_size * num_tensors;
+
+    const int32_t start_x = window.x().start();
+    const int32_t end_x   = window.x().end();
+    const int32_t start_y = window.y().start();
+    const int32_t end_y   = window.y().end();
+
+    uint8_t *out_ptr_base = output->buffer() + output->info()->offset_first_element_in_bytes() + start_x * chunk_size;
+
+    for (int32_t x = start_x; x < end_x; ++x)
     {
-        id_out.set(i, id[i - 1]);
+        const uint8_t *in_ptr =
+            input[x]->buffer() + input[x]->info()->offset_first_element_in_bytes() + start_y * chunk_size;
+        uint8_t *out_ptr = out_ptr_base + start_y * out_chunk_step;
+
+        for (int32_t y = start_y; y < end_y; ++y)
+        {
+            std::memcpy(out_ptr, in_ptr, chunk_size);
+
+            in_ptr += chunk_size;
+            out_ptr += out_chunk_step;
+        }
+
+        out_ptr_base += chunk_size;
     }
-    id_out.set(axis, idx_input);
-    return id_out;
 }
+
 } // namespace
 
-NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input()
+NEStackLayerKernel::NEStackLayerKernel() : _input(), _output(nullptr), _axis(), _split_dimension(Window::DimY)
 {
 }
 
-void NEStackLayerKernel::configure(
-    const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+void NEStackLayerKernel::configure(const std::vector<ITensor *> &input, uint32_t axis, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
-    _input     = input;
-    _output    = output;
-    _axis      = axis;
-    _idx_input = idx_input;
+    const int32_t num_tensors = input.size();
+    ARM_COMPUTE_ERROR_ON(num_tensors == 0);
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
+    const uint32_t rank = input[0]->info()->num_dimensions();
+    ARM_COMPUTE_UNUSED(rank);
 
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    for (int32_t i = 0; i < num_tensors; ++i)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]);
+        ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input[i]->info(), axis, i, num_tensors, rank, output->info()));
+    }
+
+    auto_init_if_empty(*output->info(), input[0]->info()->clone()->set_tensor_shape(
+                                            compute_stack_shape(*input[0]->info(), axis, num_tensors)));
+
+    _input  = input;
+    _output = output;
+    _axis   = axis;
 }
 
-Status NEStackLayerKernel::validate(const ITensorInfo *input,
-                                    unsigned int       axis,
-                                    unsigned int       idx_input,
-                                    unsigned int       num_tensors,
-                                    const ITensorInfo *output)
+Status NEStackLayerKernel::validate(const std::vector<ITensorInfo *> &input, uint32_t axis, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+    const int32_t num_tensors = input.size();
+    const size_t  rank        = input[0]->num_dimensions();
+
+    for (int32_t i = 0; i < num_tensors; ++i)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]);
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input[i], axis, i, num_tensors, rank, output));
+    }
+
     return Status{};
 }
 
+void NEStackLayerKernel::prepare()
+{
+    // Prepare calculates the window at runtime, in case there is padding being added after configure()
+    const ITensorInfo *input_info  = _input[0]->info();
+    const int32_t      num_dims    = input_info->num_dimensions();
+    const int32_t      num_tensors = _input.size();
+
+    // Check if there are any paddings in the input tensors
+    bool has_padding = false;
+    for (const ITensor *in : _input)
+    {
+        if (has_holes(*in->info(), num_dims - 1))
+        {
+            has_padding = true;
+            break;
+        }
+    }
+
+    has_padding = has_padding || has_holes(*_output->info(), num_dims);
+
+    Window win;
+    if (!has_padding)
+    {
+        _stack_fn = memcpy_stack;
+
+        // 2D execution window (X,Y): [Num_tensors, Dimensions >= axis]
+        win.set(Window::DimX, Window::Dimension(0, num_tensors, 1));
+        win.set(Window::DimY, Window::Dimension(0, input_info->tensor_shape().total_size_upper(_axis), 1));
+    }
+    else
+    {
+        _stack_fn = elementwise_stack;
+        win       = calculate_max_window(*input_info);
+    }
+
+    INEKernel::configure(win);
+}
+
 void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    Window window_out;
-    window_out.use_tensor_dimensions(_output->info()->tensor_shape());
-
-    Iterator input(_input, window);
-    Iterator output(_output, window_out);
-
-    const int stride_x = _output->info()->strides_in_bytes()[0];
-    const int stride_y = _output->info()->num_dimensions() >= 1 ? _output->info()->strides_in_bytes()[1] : 0;
-    const int stride_z = _output->info()->num_dimensions() >= 2 ? _output->info()->strides_in_bytes()[2] : 0;
-    const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
-    const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
-
-    execute_window_loop(
-        window,
-        [&](const Coordinates &id)
-        {
-            Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
-            const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w +
-                            id_out[4] * stride_k;
-            std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
-        },
-        input);
+    _stack_fn(_input, _output, _axis, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
index 685812b56d..02ee776ea4 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.h
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,13 +22,16 @@
  * SOFTWARE.
  */
 
-#ifndef ARM_COMPUTE_NESTACKLAYERKERNEL_H
-#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
 
 #include "src/core/NEON/INEKernel.h"
 
+#include <cstdint>
+#include <functional>
+
 namespace arm_compute
 {
 class ITensor;
@@ -57,43 +60,48 @@ public:
      *
      * @note Supported input tensor rank: up to 4
      *
-     * @param[in]  input       Input tensor. Data types supported: All
-     * @param[in]  axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
-     * @param[in]  idx_input   Index of the input tensor in the list of tensors to stack.
-     *                         All tensors in the list must have the same shape
-     * @param[in]  num_tensors Number of tensors to stack
-     * @param[out] output      Output tensor. Data types supported: Same as @p input.
+     * @param[in]  input  Input tensors. Data types supported: All
+     * @param[in]  axis   The dimension to stack the tensors along. It must not be greater than the number of input dimensions.
+     * @param[out] output Output tensor. Data types supported: Same as @p input.
      *
      */
-    void configure(
-        const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
+    void configure(const std::vector<ITensor *> &input, uint32_t axis, ITensor *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
      *
      * @note Supported input tensor rank: up to 4
      *
-     * @param[in] input       Input tensor info. Data types supported: All
-     * @param[in] axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
-     * @param[in] idx_input   Index of the input tensor in the list of tensors to stack
-     *                        All tensors in the list must have the same shape
-     * @param[in] num_tensors Number of tensors to stack
-     * @param[in] output      Output tensor info. Data types supported: Same as @p input.
+     * @param[in] input  Input tensor infos. Data types supported: All
+     * @param[in] axis   The dimension to stack the tensors along. It must not be greater than the number of input dimensions.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input,
-                           unsigned int       axis,
-                           unsigned int       idx_input,
-                           unsigned int       num_tensors,
-                           const ITensorInfo *output);
+    static Status validate(const std::vector<ITensorInfo *> &input, uint32_t axis, const ITensorInfo *output);
+
+    /** Prepare the stack kernel for execution (only executed once) by
+     *  choosing the window and the algorithm.
+     */
+    void prepare();
 
     // Inherited methods overridden
     void run(const Window &window, const ThreadInfo &info) override;
 
+    /** Get the dimension to split the kernel workload
+     *
+     * @return the split dimension
+     */
+    uint32_t get_split_dimension() const
+    {
+        return _split_dimension;
+    }
+
 private:
-    const ITensor *_input;
-    ITensor       *_output;
-    unsigned int   _axis;
-    unsigned int   _idx_input;
+    std::vector<ITensor *> _input;
+    ITensor               *_output;
+    uint32_t               _axis;
+    uint32_t               _split_dimension;
+
+    std::function<void(const std::vector<ITensor *> &, ITensor *, uint32_t, const Window &)> _stack_fn{};
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index 03e7026691..2f88ffca2a 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,7 +39,7 @@ namespace arm_compute
 NEStackLayer::~NEStackLayer() = default;
 
 NEStackLayer::NEStackLayer() // NOLINT
-    : _input(), _stack_kernels(), _num_inputs(0)
+    : _stack_kernel(std::make_unique<NEStackLayerKernel>()), _is_prepared(false)
 {
 }
 
@@ -47,17 +47,10 @@ void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITen
 {
     ARM_COMPUTE_LOG_PARAMS(input, axis, output);
 
-    _num_inputs = input.size();
-    _stack_kernels.resize(_num_inputs);
-
     // Wrap around negative values
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
 
-    for (unsigned int i = 0; i < _num_inputs; i++)
-    {
-        _stack_kernels[i] = std::make_unique<NEStackLayerKernel>();
-        _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output);
-    }
+    _stack_kernel->configure(input, axis_u, output);
 }
 
 Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
@@ -69,24 +62,20 @@ Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
     const size_t       rank   = input[0]->num_dimensions();
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
 
-    const unsigned int num_inputs = input.size();
-
-    for (unsigned int i = 0; i < num_inputs; i++)
-    {
-        // All the tensors must have the same rank
-        ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
-        // Validate Kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
-    }
+    // Validate Kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input, axis_u, output));
 
     return Status{};
 }
 
 void NEStackLayer::run()
 {
-    for (unsigned i = 0; i < _num_inputs; i++)
+    if (!_is_prepared)
     {
-        NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY);
+        _stack_kernel->prepare();
+        _is_prepared = true;
     }
+
+    NEScheduler::get().schedule(_stack_kernel.get(), _stack_kernel->get_split_dimension());
 }
 } // namespace arm_compute
diff --git a/tests/validation/NEON/StackLayer.cpp b/tests/validation/NEON/StackLayer.cpp
index d88f713ccd..3828010c7b 100644
--- a/tests/validation/NEON/StackLayer.cpp
+++ b/tests/validation/NEON/StackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,69 +44,74 @@ namespace test
 {
 namespace validation
 {
+
+using framework::dataset::make;
 namespace
 {
 // *INDENT-OFF*
 // clang-format off
 /** Data types */
-const auto data_types = framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 });
+const auto data_types = make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 });
 
 /** Num tensors values to test */
-const auto n_values = framework::dataset::make("NumTensors", { 3, 4 });
+const auto n_values = make("NumTensors", { 3, 4 });
 
 /** Shapes 1D to test */
-const auto shapes_1d_small = combine(datasets::Small1DShapes(), framework::dataset::make("Axis", -1, 2));
+const auto shapes_1d_small = combine(datasets::Small1DShapes(), make("Axis", -1, 2));
 
 /** Shapes 2D to test */
-const auto shapes_2d_small = combine(datasets::Small2DShapes(), framework::dataset::make("Axis", -2, 3));
+const auto shapes_2d_small = combine(datasets::Small2DShapes(), make("Axis", -2, 3));
 
 /** Shapes 3D to test */
-const auto shapes_3d_small = combine(datasets::Small3DShapes(), framework::dataset::make("Axis", -3, 4));
+const auto shapes_3d_small = combine(datasets::Small3DShapes(), make("Axis", -3, 4));
 
 /** Shapes 4D to test */
-const auto shapes_4d_small = combine(datasets::Small4DShapes(), framework::dataset::make("Axis", -4, 5));
+const auto shapes_4d_small = combine(datasets::Small4DShapes(), make("Axis", -4, 5));
 
 /** Shapes 1D to test */
-const auto shapes_1d_large = combine(datasets::Large1DShapes(), framework::dataset::make("Axis", -1, 2));
+const auto shapes_1d_large = combine(datasets::Large1DShapes(), make("Axis", -1, 2));
 
 /** Shapes 2D to test */
-const auto shapes_2d_large = combine(datasets::Medium2DShapes(), framework::dataset::make("Axis", -2, 3));
+const auto shapes_2d_large = combine(datasets::Medium2DShapes(), make("Axis", -2, 3));
 
 /** Shapes 3D to test */
-const auto shapes_3d_large = combine(datasets::Medium3DShapes(), framework::dataset::make("Axis", -3, 4));
+const auto shapes_3d_large = combine(datasets::Medium3DShapes(), make("Axis", -3, 4));
 
 /** Shapes 4D to test */
-const auto shapes_4d_large = combine(datasets::Medium4DShapes(), framework::dataset::make("Axis", -4, 5));
+const auto shapes_4d_large = combine(datasets::Medium4DShapes(), make("Axis", -4, 5));
 } // namespace
 
 /** Fixture to use */
 template<typename T>
 using NEStackLayerFixture = StackLayerValidationFixture<Tensor, ITensor, Accessor, NEStackLayer, T>;
 
+template<typename T>
+using NEStackLayerWithPaddingFixture = StackLayerWithPaddingValidationFixture<Tensor, ITensor, Accessor, NEStackLayer, T>;
+
 using namespace arm_compute::misc::shape_calculator;
 
 TEST_SUITE(NEON)
 TEST_SUITE(StackLayer)
 
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                                      framework::dataset::make("InputInfo",
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+make("InputInfo",
 {
     std::vector<TensorInfo>{ TensorInfo(TensorShape(9U, 8U), 1, DataType::U8) },
-    std::vector<TensorInfo>{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)}, 
+    std::vector<TensorInfo>{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)},
     std::vector<TensorInfo>{ TensorInfo(TensorShape(2U, 3U), 1, DataType::S32) },
-    std::vector<TensorInfo>{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)}, 
+    std::vector<TensorInfo>{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)},
     std::vector<TensorInfo>{ TensorInfo(TensorShape(9U, 8U), 1, DataType::S32) },
 }),
-framework::dataset::make("OutputInfo",
+make("OutputInfo",
 {
     TensorInfo(TensorShape(1U, 9U, 8U), 1, DataType::U8),   // Passes, stack 1 tensor on x axis
     TensorInfo(TensorShape(1U, 3U, 2U), 1, DataType::U8),   // Passes, stack 3 tensors on y axis
     TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::S32),  // fails axis <  (- input's rank)
     TensorInfo(TensorShape(3U, 7U, 5U), 1, DataType::S32),  // fails, input dimensions > 4
     TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::U8),   // fails mismatching data types
-})),
-framework::dataset::make("Axis", { -3, 1, -4, -3, 1 })),
-framework::dataset::make("Expected", { true, true, false, false, false })),
+}),
+make("Axis", { -3, 1, -4, -3, 1 }),
+make("Expected", { true, true, false, false, false })),
 input_info, output_info, axis, expected)
 {
     std::vector<TensorInfo>    ti(input_info);
@@ -121,18 +126,18 @@ input_info, output_info, axis, expected)
 TEST_SUITE(Shapes1D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -141,18 +146,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -161,18 +166,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -183,18 +188,18 @@ TEST_SUITE_END() // Shapes1D
 TEST_SUITE(Shapes2D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -203,18 +208,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -223,18 +228,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -245,18 +250,18 @@ TEST_SUITE_END() // Shapes2D
 TEST_SUITE(Shapes3D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -265,18 +270,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -285,18 +290,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -307,18 +312,29 @@ TEST_SUITE_END() // Shapes3D
 TEST_SUITE(Shapes4D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+// Padding is tested only with 4D shapes and a single data type because the underlying code path is selected
+// by the presence of padding alone, which is not affected by the shape or the data type.
+FIXTURE_DATA_TEST_CASE(RunSmallWithPadding, NEStackLayerWithPaddingFixture<int>, framework::DatasetMode::ALL,
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -327,18 +343,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -347,24 +363,37 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S8
 TEST_SUITE_END() // Shapes4D
+
+TEST_SUITE(HighDimensional)
+// The Cpu implementation supports tensors of size 4D+, but the reference implementation does not, so this test is DISABLED.
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, NEStackLayerFixture<char>, framework::DatasetMode::DISABLED,
+    combine(make("Shape", { TensorShape{2U, 3U, 4U, 5U, 3U} }),
+            make("Axis", { 5, 0, -3, 2 }),
+            make("DataType", { DataType::S8 }),
+            make("NumTensors", { 3 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // HighDimensional
 TEST_SUITE_END() // StackLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/fixtures/StackLayerFixture.h b/tests/validation/fixtures/StackLayerFixture.h
index 7320a032bd..7dd8fe47dc 100644
--- a/tests/validation/fixtures/StackLayerFixture.h
+++ b/tests/validation/fixtures/StackLayerFixture.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorShape.h"
@@ -54,7 +54,7 @@ class StackLayerValidationFixture : public framework::Fixture
 public:
     void setup(TensorShape shape_src, int axis, DataType data_type, int num_tensors)
     {
-        _target    = compute_target(shape_src, axis, data_type, num_tensors);
+        _target    = compute_target(shape_src, axis, data_type, num_tensors, false /* add_x_padding */);
         _reference = compute_reference(shape_src, axis, data_type, num_tensors);
     }
 
@@ -65,7 +65,7 @@ protected:
         library->fill_tensor_uniform(tensor, i);
     }
 
-    TensorType compute_target(TensorShape shape_src, int axis, DataType data_type, int num_tensors)
+    TensorType compute_target(TensorShape shape_src, int axis, DataType data_type, int num_tensors, bool add_x_padding)
     {
         std::vector<TensorType>           tensors(num_tensors);
         std::vector<AbstractTensorType *> src(num_tensors);
@@ -90,6 +90,11 @@ protected:
         // Allocate and fill the input tensors
         for(int i = 0; i < num_tensors; ++i)
         {
+            if(add_x_padding)
+            {
+                add_padding_x({&tensors[i]}, DataLayout::NHWC);
+            }
+
             ARM_COMPUTE_ASSERT(tensors[i].info()->is_resizable());
             tensors[i].allocator()->allocate();
             ARM_COMPUTE_ASSERT(!tensors[i].info()->is_resizable());
@@ -98,6 +103,11 @@ protected:
             fill(AccessorType(tensors[i]), i);
         }
 
+        if(add_x_padding)
+        {
+            add_padding_x({&dst}, DataLayout::NHWC);
+        }
+
         // Allocate output tensor
         dst.allocator()->allocate();
 
@@ -131,7 +141,21 @@ protected:
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
+
+template <typename TensorType, typename AbstractTensorType, typename AccessorType, typename FunctionType, typename T>
+class StackLayerWithPaddingValidationFixture :
+    public StackLayerValidationFixture<TensorType, AbstractTensorType, AccessorType, FunctionType, T>
+{
+public:
+    using Parent = StackLayerValidationFixture<TensorType, AbstractTensorType, AccessorType, FunctionType, T>;
+
+    void setup(TensorShape shape_src, int axis, DataType data_type, int num_tensors)
+    {
+        Parent::_target    = Parent::compute_target(shape_src, axis, data_type, num_tensors, true /* add_x_padding */);
+        Parent::_reference = Parent::compute_reference(shape_src, axis, data_type, num_tensors);
+    }
+};
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H
-- 
cgit v1.2.1