path: root/src/core/NEON/kernels
author     Gunes Bayir <gunes.bayir@arm.com>  2023-10-07 23:52:48 +0100
committer  Gunes Bayir <gunes.bayir@arm.com>  2023-10-10 09:48:53 +0000
commit     0b72aa4b2abdba7ab48aaa8a45c624ba1e27a411 (patch)
tree       ea14c31a15c623cfa07db1dba722cd4ae61621b0 /src/core/NEON/kernels
parent     c6137d2be4fb781b63831138970146a4eb8550a1 (diff)
download   ComputeLibrary-0b72aa4b2abdba7ab48aaa8a45c624ba1e27a411.tar.gz
Optimize NEStackLayer
Optimize the stack operation in Cpu by leveraging block memcpy.

Resolves: COMPMID-6498
Change-Id: I49d79d179f0375a73d654edd59fb33072112569b
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10451
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r--  src/core/NEON/kernels/NEStackLayerKernel.cpp  196
-rw-r--r--  src/core/NEON/kernels/NEStackLayerKernel.h     62
2 files changed, 169 insertions, 89 deletions
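
The core idea of the change, restated as a minimal standalone sketch (illustrative C++ with made-up names, not the kernel code; like the kernel's fast path it assumes dense, unpadded tensors): stacking N equally shaped tensors along `axis` interleaves, for every index combination at or above the axis, one contiguous block of total_size_lower(axis) elements from each input, so each block can be moved with a single memcpy instead of element-by-element copies.

    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Stack `inputs` (all with identical `shape`, dimension 0 contiguous) along `axis`.
    // chunk = elements below the stack axis (contiguous per input),
    // outer = elements at or above the stack axis.
    void stack_memcpy(const std::vector<const float *> &inputs,
                      const std::vector<std::size_t>   &shape,
                      std::size_t                       axis,
                      float                            *out)
    {
        std::size_t chunk = 1, outer = 1;
        for (std::size_t d = 0; d < shape.size(); ++d)
        {
            (d < axis ? chunk : outer) *= shape[d];
        }

        const std::size_t num_tensors    = inputs.size();
        const std::size_t chunk_bytes    = chunk * sizeof(float);
        const std::size_t out_chunk_step = chunk * num_tensors; // output stride between consecutive outer blocks of one input

        for (std::size_t t = 0; t < num_tensors; ++t) // input index == coordinate along the new axis
        {
            for (std::size_t o = 0; o < outer; ++o) // outer block index
            {
                std::memcpy(out + o * out_chunk_step + t * chunk, inputs[t] + o * chunk, chunk_bytes);
            }
        }
    }

The kernel's new memcpy_stack() below uses the same arithmetic, but walks the (input tensor, outer block) pair as a 2D execution window so the scheduler can split the work across threads.
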
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index e23b40a9aa..225e4fcfd2 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,6 +33,7 @@
#include "arm_compute/core/Window.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
@@ -42,9 +43,10 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
Status validate_arguments(const ITensorInfo *input,
- unsigned int axis,
- unsigned int idx_input,
- unsigned int num_tensors,
+ uint32_t axis,
+ uint32_t idx_input,
+ uint32_t num_tensors,
+ uint32_t rank,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -53,6 +55,7 @@ Status validate_arguments(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != rank);
if (output->total_size() != 0)
{
@@ -65,93 +68,162 @@ Status validate_arguments(const ITensorInfo *input,
return Status{};
}
-std::pair<Status, Window>
-validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+inline Coordinates
+shift_from_axis_and_replace_coordinate(const Coordinates &id, uint32_t axis, uint32_t idx_input, uint32_t num_dims)
{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
+ Coordinates id_out = id;
+ for (uint32_t i = num_dims; i > axis; --i)
+ {
+ id_out.set(i, id[i - 1]);
+ }
+ id_out.set(axis, idx_input);
+ return id_out;
+}
- // Configure kernel window
- Window win = calculate_max_window(*input);
+void elementwise_stack(const std::vector<ITensor *> &input, ITensor *output, uint32_t axis, const Window &window)
+{
+ Window window_out;
+ window_out.use_tensor_dimensions(output->info()->tensor_shape());
+
+ const int32_t num_tensors = input.size();
+ const size_t element_size = input[0]->info()->element_size();
+ const uint32_t num_dims = static_cast<uint32_t>(input[0]->info()->num_dimensions());
- return std::make_pair(Status{}, win);
+ for (int32_t idx_input = 0; idx_input < num_tensors; ++idx_input)
+ {
+ Iterator input_it(input[idx_input], window);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ Coordinates id_out = shift_from_axis_and_replace_coordinate(id, axis, idx_input, num_dims);
+ std::memcpy(output->ptr_to_element(id_out), input_it.ptr(), element_size);
+ },
+ input_it);
+ }
}
-inline Coordinates
-shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+void memcpy_stack(const std::vector<ITensor *> &input, ITensor *output, uint32_t axis, const Window &window)
{
- constexpr int max_out_coord = 5; // Input shape is max a 4D shape, output is max 5D
- Coordinates id_out = id;
- for (unsigned int i = max_out_coord - 1; i > axis; --i)
+ const int32_t element_size = input[0]->info()->element_size();
+ const int32_t chunk_size = input[0]->info()->tensor_shape().total_size_lower(axis) * element_size;
+ const int32_t num_tensors = input.size();
+ const int32_t out_chunk_step = chunk_size * num_tensors;
+
+ const int32_t start_x = window.x().start();
+ const int32_t end_x = window.x().end();
+ const int32_t start_y = window.y().start();
+ const int32_t end_y = window.y().end();
+
+ uint8_t *out_ptr_base = output->buffer() + output->info()->offset_first_element_in_bytes() + start_x * chunk_size;
+
+ for (int32_t x = start_x; x < end_x; ++x)
{
- id_out.set(i, id[i - 1]);
+ const uint8_t *in_ptr =
+ input[x]->buffer() + input[x]->info()->offset_first_element_in_bytes() + start_y * chunk_size;
+ uint8_t *out_ptr = out_ptr_base + start_y * out_chunk_step;
+
+ for (int32_t y = start_y; y < end_y; ++y)
+ {
+ std::memcpy(out_ptr, in_ptr, chunk_size);
+
+ in_ptr += chunk_size;
+ out_ptr += out_chunk_step;
+ }
+
+ out_ptr_base += chunk_size;
}
- id_out.set(axis, idx_input);
- return id_out;
}
+
} // namespace
-NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input()
+NEStackLayerKernel::NEStackLayerKernel() : _input(), _output(nullptr), _axis(), _split_dimension(Window::DimY)
{
}
-void NEStackLayerKernel::configure(
- const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+void NEStackLayerKernel::configure(const std::vector<ITensor *> &input, uint32_t axis, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- _input = input;
- _output = output;
- _axis = axis;
- _idx_input = idx_input;
+ const int32_t num_tensors = input.size();
+ ARM_COMPUTE_ERROR_ON(num_tensors == 0);
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
+ const uint32_t rank = input[0]->info()->num_dimensions();
+ ARM_COMPUTE_UNUSED(rank);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ for (int32_t i = 0; i < num_tensors; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input[i]->info(), axis, i, num_tensors, rank, output->info()));
+ }
+
+ auto_init_if_empty(*output->info(), input[0]->info()->clone()->set_tensor_shape(
+ compute_stack_shape(*input[0]->info(), axis, num_tensors)));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
}
-Status NEStackLayerKernel::validate(const ITensorInfo *input,
- unsigned int axis,
- unsigned int idx_input,
- unsigned int num_tensors,
- const ITensorInfo *output)
+Status NEStackLayerKernel::validate(const std::vector<ITensorInfo *> &input, uint32_t axis, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(
- validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ const int32_t num_tensors = input.size();
+ const size_t rank = input[0]->num_dimensions();
+
+ for (int32_t i = 0; i < num_tensors; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input[i], axis, i, num_tensors, rank, output));
+ }
+
return Status{};
}
+void NEStackLayerKernel::prepare()
+{
+    // prepare() computes the window at run time, in case padding is added to the tensors after configure()
+ const ITensorInfo *input_info = _input[0]->info();
+ const int32_t num_dims = input_info->num_dimensions();
+ const int32_t num_tensors = _input.size();
+
+    // Check whether any of the input tensors has padding
+ bool has_padding = false;
+ for (const ITensor *in : _input)
+ {
+ if (has_holes(*in->info(), num_dims - 1))
+ {
+ has_padding = true;
+ break;
+ }
+ }
+
+ has_padding = has_padding || has_holes(*_output->info(), num_dims);
+
+ Window win;
+ if (!has_padding)
+ {
+ _stack_fn = memcpy_stack;
+
+ // 2D execution window (X,Y): [Num_tensors, Dimensions >= axis]
+ win.set(Window::DimX, Window::Dimension(0, num_tensors, 1));
+ win.set(Window::DimY, Window::Dimension(0, input_info->tensor_shape().total_size_upper(_axis), 1));
+ }
+ else
+ {
+ _stack_fn = elementwise_stack;
+ win = calculate_max_window(*input_info);
+ }
+
+ INEKernel::configure(win);
+}
+
void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window window_out;
- window_out.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Iterator input(_input, window);
- Iterator output(_output, window_out);
-
- const int stride_x = _output->info()->strides_in_bytes()[0];
- const int stride_y = _output->info()->num_dimensions() >= 1 ? _output->info()->strides_in_bytes()[1] : 0;
- const int stride_z = _output->info()->num_dimensions() >= 2 ? _output->info()->strides_in_bytes()[2] : 0;
- const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
- const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
-
- execute_window_loop(
- window,
- [&](const Coordinates &id)
- {
- Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
- const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w +
- id_out[4] * stride_k;
- std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
- },
- input);
+ _stack_fn(_input, _output, _axis, window);
}
} // namespace arm_compute
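
Why prepare() gates the fast path on padding: the memcpy path copies whole chunks of total_size_lower(axis) * element_size bytes at a time, which is only valid when those elements sit back to back in memory. A rough standalone illustration of such a density check (assumed names and simplified logic; the library's has_holes() helper is what the kernel actually relies on):

    #include <cstddef>
    #include <vector>

    // A tensor is "dense up to dimension d" if every stride up to d equals the
    // product of the lower-dimensional extents times the element size, i.e. no
    // padding bytes are interleaved below dimension d.
    bool is_dense_up_to(const std::vector<std::size_t> &shape,
                        const std::vector<std::size_t> &strides_in_bytes,
                        std::size_t                     element_size,
                        std::size_t                     d)
    {
        std::size_t expected = element_size;
        for (std::size_t i = 0; i <= d && i < shape.size(); ++i)
        {
            if (strides_in_bytes[i] != expected)
            {
                return false; // hole/padding detected below this dimension
            }
            expected *= shape[i];
        }
        return true;
    }

When any input (or the output) fails this kind of check, the kernel falls back to the element-wise loop, which copies element_size bytes per coordinate and therefore tolerates arbitrary strides.
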
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
index 685812b56d..02ee776ea4 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.h
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,16 @@
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NESTACKLAYERKERNEL_H
-#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
#include "src/core/NEON/INEKernel.h"
+#include <cstdint>
+#include <functional>
+
namespace arm_compute
{
class ITensor;
@@ -57,43 +60,48 @@ public:
*
* @note Supported input tensor rank: up to 4
*
- * @param[in] input Input tensor. Data types supported: All
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack.
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[out] output Output tensor. Data types supported: Same as @p input.
+ * @param[in] input Input tensors. Data types supported: All
+ * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(
- const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
+ void configure(const std::vector<ITensor *> &input, uint32_t axis, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
*
* @note Supported input tensor rank: up to 4
*
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ * @param[in] input Input tensor infos. Data types supported: All
+ * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- unsigned int axis,
- unsigned int idx_input,
- unsigned int num_tensors,
- const ITensorInfo *output);
+ static Status validate(const std::vector<ITensorInfo *> &input, uint32_t axis, const ITensorInfo *output);
+
+    /** Prepare the stack kernel for execution (only executed once),
+     * choosing the execution window and the stacking algorithm.
+ */
+ void prepare();
// Inherited methods overridden
void run(const Window &window, const ThreadInfo &info) override;
+ /** Get the dimension to split the kernel workload
+ *
+ * @return the split dimension
+ */
+ uint32_t get_split_dimension() const
+ {
+ return _split_dimension;
+ }
+
private:
- const ITensor *_input;
- ITensor *_output;
- unsigned int _axis;
- unsigned int _idx_input;
+ std::vector<ITensor *> _input;
+ ITensor *_output;
+ uint32_t _axis;
+ uint32_t _split_dimension;
+
+ std::function<void(const std::vector<ITensor *> &, ITensor *, uint32_t, const Window &)> _stack_fn{};
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
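
For orientation, a hedged sketch of how a caller (for example the NEStackLayer runtime function) might drive the reworked kernel after this change. Only the kernel methods (validate, configure, prepare, get_split_dimension, run via the scheduler) come from the diff above; the tensor setup, the includes and the scheduler call are assumptions about the surrounding code, not part of this patch:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/NEON/kernels/NEStackLayerKernel.h"

    #include <vector>

    using namespace arm_compute;

    // in0, in1 and out are assumed to be initialised and allocated by the caller.
    void stack_two_tensors(Tensor &in0, Tensor &in1, Tensor &out)
    {
        std::vector<ITensor *>     tensors = {&in0, &in1};
        std::vector<ITensorInfo *> infos   = {in0.info(), in1.info()};

        const uint32_t axis = 0; // position at which the new (stacking) dimension is inserted

        // One validate()/configure() call now covers all inputs, instead of one kernel per input.
        ARM_COMPUTE_ERROR_THROW_ON(NEStackLayerKernel::validate(infos, axis, out.info()));

        NEStackLayerKernel kernel;
        kernel.configure(tensors, axis, &out);

        // prepare() picks the execution window and the memcpy vs. element-wise path at run
        // time, so it accounts for any padding added to the tensors after configure().
        kernel.prepare();
        NEScheduler::get().schedule(&kernel, kernel.get_split_dimension());
    }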