author     Viet-Hoa Do <viet-hoa.do@arm.com>   2023-11-13 17:20:45 +0000
committer  Viet-Hoa Do <viet-hoa.do@arm.com>   2023-12-07 09:42:47 +0000
commit     47370943471c98b5ed4c954b350b925d368a810e (patch)
tree       f5fd7d4c4a51b538874632607887b93d8ece3c07 /src/core/NEON/kernels
parent     17e116e90e6b962a09c133c646b6ad7884e94693 (diff)
Optimize CPU depth-to-space
Resolves: COMPMID-6622
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: Ibac276618bdda125dcbb9c851c547f12739b15b4
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10749
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r--  src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp | 150
-rw-r--r--  src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h   |  20
2 files changed, 105 insertions, 65 deletions
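For reference, depth-to-space rearranges an input with c_out * block_shape^2 channels into an output with c_out channels and block_shape-times larger height and width. Below is a minimal NHWC reference sketch in plain C++ (illustrative names and row-major flat buffers, not the library's kernel); it mirrors the per-element index mapping of the loop this commit removes, which the new code replaces with the depth_to_space_nchw_any / depth_to_space_nhwc_any routines driven by raw strides and pointer arithmetic.

#include <cstddef>

// Reference NHWC depth-to-space (illustrative sketch only):
// dst[n][y*bs + by][x*bs + bx][c] = src[n][y][x][(by*bs + bx)*c_out + c]
// src is N x H x W x (c_out*bs*bs), dst is N x (H*bs) x (W*bs) x c_out, row-major.
void depth_to_space_nhwc_ref(
    const float *src, float *dst, size_t n, size_t h, size_t w, size_t c_out, size_t bs)
{
    const size_t c_in = c_out * bs * bs;
    for (size_t b = 0; b < n; ++b)
        for (size_t y = 0; y < h; ++y)
            for (size_t x = 0; x < w; ++x)
                for (size_t by = 0; by < bs; ++by)
                    for (size_t bx = 0; bx < bs; ++bx)
                        for (size_t c = 0; c < c_out; ++c)
                        {
                            const size_t src_idx = ((b * h + y) * w + x) * c_in + (by * bs + bx) * c_out + c;
                            const size_t dst_idx = ((b * h * bs + y * bs + by) * (w * bs) + x * bs + bx) * c_out + c;
                            dst[dst_idx] = src[src_idx];
                        }
}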
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index de0079ee60..e0eb5cf202 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
+#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
@@ -31,13 +32,10 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/depth_to_space/list.h"
-#include <arm_neon.h>
#include <cstdint>
-using namespace arm_compute::misc::shape_calculator;
-
namespace arm_compute
{
namespace
@@ -70,15 +68,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
} // namespace
NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
+ : _input(nullptr),
+ _output(nullptr),
+ _block_shape(),
+ _data_layout(DataLayout::UNKNOWN),
+ _split_dimension(Window::DimY)
{
}
void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape =
- compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape = misc::shape_calculator::compute_depth_to_space_shape(
+ input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -90,9 +92,31 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output,
_block_shape = block_shape;
_data_layout = input->info()->data_layout();
+ constexpr size_t dim_b = 3;
+ const auto dim_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const auto dim_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const auto dim_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_ERROR_ON(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES) != dim_b);
+
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Steps steps;
+ steps.set(dim_h, block_shape);
+ steps.set(dim_w, block_shape);
+ steps.set(dim_c, output->info()->dimension(dim_c));
+
+ Window win = calculate_max_window(*output->info(), steps);
ICPPKernel::configure(win);
+
+ const auto num_batches = input->info()->tensor_shape().total_size_upper(dim_b);
+ if (num_batches > 1)
+ {
+ _split_dimension = dim_b;
+ }
+ else
+ {
+ _split_dimension = dim_h;
+ }
}
Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
@@ -102,68 +126,80 @@ Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITens
return Status{};
}
+size_t NEDepthToSpaceLayerKernel::get_split_dimension() const
+{
+ return _split_dimension;
+}
+
void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const int depth_size = _input->info()->dimension(idx_channel);
- const int r = (depth_size / (_block_shape * _block_shape));
- const int element_size = _input->info()->element_size();
+ const auto *input_info = _input->info();
+ const auto *output_info = _output->info();
+
+ const auto element_size = input_info->element_size();
+ const auto &input_strides = input_info->strides_in_bytes();
+ const auto &output_strides = output_info->strides_in_bytes();
+
+ const auto &input_shape = input_info->tensor_shape();
- Window slice_out = window.first_slice_window_3D();
+ const uintptr_t k_input_strides[] = {input_strides[0], input_strides[1], input_strides[2], input_strides[3]};
+ const uintptr_t k_output_strides[] = {output_strides[0], output_strides[1], output_strides[2], output_strides[3]};
- // The slice_out slice does not move
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ const uint8_t *k_input_ptr = _input->buffer();
+ uint8_t *k_output_ptr = //
+ _output->buffer() + //
+ window[3].start() * output_strides[3] + //
+ window[2].start() * output_strides[2] + //
+ window[1].start() * output_strides[1] + //
+ window[0].start() * output_strides[0];
- // Main loop for NCHW and NHWC
if (_data_layout == DataLayout::NCHW)
{
- Window slice_in = window.first_slice_window_2D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(
- slice_in,
- [&](const Coordinates &id)
- {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{out_x, out_y, z, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_2D(slice_in));
+ ARM_COMPUTE_ERROR_ON_MSG(window[2].start() != 0 || window[2].end() != window[2].step(),
+ "The window cannot be splitted in channel dimension");
+
+ const uintptr_t k_input_shape[] = {
+ window.num_iterations(0), //
+ window.num_iterations(1), //
+ input_shape[2], // The window cannot be split in the channel dimension.
+ window.num_iterations(3) //
+ };
+
+ k_input_ptr += window[3].start() * input_strides[3] + //
+ window[2].start() * _block_shape * _block_shape * input_strides[2] + //
+ (window[1].start() / _block_shape) * input_strides[1] + //
+ (window[0].start() / _block_shape) * input_strides[0];
+
+ cpu::depth_to_space_nchw_any( //
+ k_input_ptr, k_output_ptr, //
+ k_input_shape, k_input_strides, k_output_strides, //
+ element_size, _block_shape);
}
else
{
- Window slice_in = window.first_slice_window_3D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(
- slice_in,
- [&](const Coordinates &id)
- {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{z, out_x, out_y, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_3D(slice_in));
+ ARM_COMPUTE_ERROR_ON_MSG(window[0].start() != 0 || window[0].end() != window[0].step(),
+ "The window cannot be splitted in channel dimension");
+
+ const uintptr_t k_input_shape[] = {
+ input_shape[0], // The window cannot be split in the channel dimension.
+ window.num_iterations(1), //
+ window.num_iterations(2), //
+ window.num_iterations(3) //
+ };
+
+ k_input_ptr += window[3].start() * input_strides[3] + //
+ (window[2].start() / _block_shape) * input_strides[2] + //
+ (window[1].start() / _block_shape) * input_strides[1] + //
+ window[0].start() * _block_shape * _block_shape * input_strides[0];
+
+ cpu::depth_to_space_nhwc_any( //
+ k_input_ptr, k_output_ptr, //
+ k_input_shape, k_input_strides, k_output_strides, //
+ element_size, _block_shape);
}
}
} // namespace arm_compute
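The new get_split_dimension() accessor tells the caller to parallelise over the batch dimension (index 3) when there is more than one batch, and over rows otherwise, matching the coarser window steps configured above. The function-level changes sit outside the src/core/NEON/kernels path covered by this diffstat, so the snippet below is only a hedged sketch of how a caller could forward the hint to the scheduler, assuming the standard IScheduler::Hints interface.

#include "arm_compute/runtime/NEON/NEScheduler.h"

#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"

// Hypothetical caller-side sketch (not part of this patch): schedule the kernel
// and split its window along the dimension the kernel recommends.
void schedule_depth_to_space(arm_compute::NEDepthToSpaceLayerKernel &kernel)
{
    arm_compute::NEScheduler::get().schedule(
        &kernel, arm_compute::IScheduler::Hints(kernel.get_split_dimension()));
}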
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
index 7e18dd88b8..ca431ec5fe 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -68,14 +68,18 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+ /** Get the dimension the scheduler should use to split. */
+ size_t get_split_dimension() const;
+
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
- DataLayout _data_layout; /**< Data layout of the operation */
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+ int32_t _block_shape; /**< Block shape */
+ DataLayout _data_layout; /**< Data layout of the operation */
+ size_t _split_dimension; /**< The dimension the scheduler should use to split the workload. */
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
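To round off, here is a rough configure/validate sketch of the kernel in isolation; the shapes, block size, and simplified runtime plumbing are illustrative only, and in practice the public NEDepthToSpaceLayer function wraps this kernel.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"

using namespace arm_compute;

void configure_example()
{
    // NHWC shape in ACL coordinate order (C, W, H, N): 16 input channels,
    // 8x8 spatial, 1 batch. With block_shape = 2 the output becomes (4, 16, 16, 1).
    Tensor src, dst;
    TensorInfo src_info(TensorShape(16U, 8U, 8U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    src.allocator()->init(src_info);

    NEDepthToSpaceLayerKernel kernel;
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthToSpaceLayerKernel::validate(src.info(), dst.info(), 2));
    kernel.configure(&src, &dst, 2); // dst info is auto-initialised from src

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // The caller would then run the kernel through the scheduler, splitting
    // along kernel.get_split_dimension() as sketched above.
}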