From 47370943471c98b5ed4c954b350b925d368a810e Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Mon, 13 Nov 2023 17:20:45 +0000 Subject: Optimize CPU depth-to-space Resolves: COMPMID-6622 Signed-off-by: Viet-Hoa Do Change-Id: Ibac276618bdda125dcbb9c851c547f12739b15b4 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10749 Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/BUILD.bazel | 2 + src/CMakeLists.txt | 2 + .../NEON/kernels/NEDepthToSpaceLayerKernel.cpp | 150 +++++++++++++-------- src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h | 20 +-- src/cpu/kernels/depth_to_space/list.h | 47 +++++++ src/cpu/kernels/depth_to_space/nchw/any/impl.cpp | 123 +++++++++++++++++ src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp | 105 +++++++++++++++ src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp | 17 ++- 8 files changed, 398 insertions(+), 68 deletions(-) create mode 100644 src/cpu/kernels/depth_to_space/list.h create mode 100644 src/cpu/kernels/depth_to_space/nchw/any/impl.cpp create mode 100644 src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp (limited to 'src') diff --git a/src/BUILD.bazel b/src/BUILD.bazel index c14e10c836..f0c4b52688 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -733,6 +733,8 @@ filegroup( "cpu/kernels/crop/generic/neon/fp16.cpp", "cpu/kernels/crop/generic/neon/fp32.cpp", "cpu/kernels/crop/generic/neon/integer.cpp", + "cpu/kernels/depth_to_space/nchw/any/impl.cpp", + "cpu/kernels/depth_to_space/nhwc/any/impl.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e6c6782da1..0124574765 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -724,6 +724,8 @@ target_sources( cpu/kernels/crop/generic/neon/fp16.cpp cpu/kernels/crop/generic/neon/fp32.cpp cpu/kernels/crop/generic/neon/integer.cpp + cpu/kernels/depth_to_space/nchw/any/impl.cpp + cpu/kernels/depth_to_space/nhwc/any/impl.cpp cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index de0079ee60..e0eb5cf202 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" +#include "arm_compute/core/CoreTypes.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" @@ -31,13 +32,10 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/depth_to_space/list.h" -#include #include -using namespace arm_compute::misc::shape_calculator; - namespace arm_compute { namespace @@ -70,15 +68,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } // namespace NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN) + : _input(nullptr), + _output(nullptr), + _block_shape(), + _data_layout(DataLayout::UNKNOWN), + _split_dimension(Window::DimY) { } void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = - compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = misc::shape_calculator::compute_depth_to_space_shape( + input->info()->tensor_shape(), input->info()->data_layout(), block_shape); // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); @@ -90,9 +92,31 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, _block_shape = block_shape; _data_layout = input->info()->data_layout(); + constexpr size_t dim_b = 3; + const auto dim_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const auto dim_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const auto dim_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_ERROR_ON(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES) != dim_b); + // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); + Steps steps; + steps.set(dim_h, block_shape); + steps.set(dim_w, block_shape); + steps.set(dim_c, output->info()->dimension(dim_c)); + + Window win = calculate_max_window(*output->info(), steps); ICPPKernel::configure(win); + + const auto num_batches = input->info()->tensor_shape().total_size_upper(dim_b); + if (num_batches > 1) + { + _split_dimension = dim_b; + } + else + { + _split_dimension = dim_h; + } } Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -102,68 +126,80 @@ Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITens return Status{}; } +size_t NEDepthToSpaceLayerKernel::get_split_dimension() const +{ + return _split_dimension; +} + void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int depth_size = _input->info()->dimension(idx_channel); - const int r = (depth_size / (_block_shape * _block_shape)); - const int element_size = _input->info()->element_size(); + const auto *input_info = _input->info(); + const auto *output_info = _output->info(); + + const auto element_size = input_info->element_size(); + const auto &input_strides = input_info->strides_in_bytes(); + const auto &output_strides = output_info->strides_in_bytes(); + + const auto &input_shape = input_info->tensor_shape(); - Window slice_out = window.first_slice_window_3D(); + const uintptr_t k_input_strides[] = {input_strides[0], input_strides[1], input_strides[2], input_strides[3]}; + const uintptr_t k_output_strides[] = {output_strides[0], output_strides[1], output_strides[2], output_strides[3]}; - // The slice_out slice does not move - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + const uint8_t *k_input_ptr = _input->buffer(); + uint8_t *k_output_ptr = // + _output->buffer() + // + window[3].start() * output_strides[3] + // + window[2].start() * output_strides[2] + // + window[1].start() * output_strides[1] + // + window[0].start() * output_strides[0]; - // Main loop for NCHW and NHWC if (_data_layout == DataLayout::NCHW) { - Window slice_in = window.first_slice_window_2D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop( - slice_in, - [&](const Coordinates &id) - { - const int x = id.x(); - const int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{out_x, out_y, z, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_2D(slice_in)); + ARM_COMPUTE_ERROR_ON_MSG(window[2].start() != 0 || window[2].end() != window[2].step(), + "The window cannot be splitted in channel dimension"); + + const uintptr_t k_input_shape[] = { + window.num_iterations(0), // + window.num_iterations(1), // + input_shape[2], // The window cannot be splitted in channel dimension. + window.num_iterations(3) // + }; + + k_input_ptr += window[3].start() * input_strides[3] + // + window[2].start() * _block_shape * _block_shape * input_strides[2] + // + (window[1].start() / _block_shape) * input_strides[1] + // + (window[0].start() / _block_shape) * input_strides[0]; + + cpu::depth_to_space_nchw_any( // + k_input_ptr, k_output_ptr, // + k_input_shape, k_input_strides, k_output_strides, // + element_size, _block_shape); } else { - Window slice_in = window.first_slice_window_3D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop( - slice_in, - [&](const Coordinates &id) - { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{z, out_x, out_y, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_3D(slice_in)); + ARM_COMPUTE_ERROR_ON_MSG(window[0].start() != 0 || window[0].end() != window[0].step(), + "The window cannot be splitted in channel dimension"); + + const uintptr_t k_input_shape[] = { + input_shape[0], // The window cannot be splitted in channel dimension. + window.num_iterations(1), // + window.num_iterations(2), // + window.num_iterations(3) // + }; + + k_input_ptr += window[3].start() * input_strides[3] + // + (window[2].start() / _block_shape) * input_strides[2] + // + (window[1].start() / _block_shape) * input_strides[1] + // + window[0].start() * _block_shape * _block_shape * input_strides[0]; + + cpu::depth_to_space_nhwc_any( // + k_input_ptr, k_output_ptr, // + k_input_shape, k_input_strides, k_output_strides, // + element_size, _block_shape); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h index 7e18dd88b8..ca431ec5fe 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H -#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H #include "src/core/NEON/INEKernel.h" @@ -68,14 +68,18 @@ public: */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + /** Get the dimension the scheduler should use to split. */ + size_t get_split_dimension() const; + // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ - int32_t _block_shape; /**< Block shape */ - DataLayout _data_layout; /**< Data layout of the operation */ + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ + DataLayout _data_layout; /**< Data layout of the operation */ + size_t _split_dimension; /**< The dimension the scheduler should use to split the workload. */ }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H diff --git a/src/cpu/kernels/depth_to_space/list.h b/src/cpu/kernels/depth_to_space/list.h new file mode 100644 index 0000000000..9d0cd1e740 --- /dev/null +++ b/src/cpu/kernels/depth_to_space/list.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H +#define ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H + +#include + +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_DEPTHTOSPACE_KERNEL(func_name) \ + void func_name(const uint8_t *src, uint8_t *dst, const uintptr_t src_shape[4], const uintptr_t src_strides[4], \ + const uintptr_t dst_strides[4], uintptr_t element_size, uintptr_t block_size) + +DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nhwc_any); +DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nchw_any); + +#undef DECLARE_DEPTHTOSPACE_KERNEL + +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H diff --git a/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp new file mode 100644 index 0000000000..0277690112 --- /dev/null +++ b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Error.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ + +void depth_to_space_nchw_any( // + const uint8_t *src, + uint8_t *dst, + const uintptr_t src_shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4], + uintptr_t element_size, + uintptr_t block_size) +{ + ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size); + ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size); + + const auto dst_channels = src_shape[2] / (block_size * block_size); + const auto src_block_col_stride = dst_channels * src_strides[2]; + const auto src_block_row_stride = block_size * dst_channels * src_strides[2]; + + auto *src_batch_ptr = src; + auto *dst_batch_ptr = dst; + + for (uintptr_t batch = 0; batch < src_shape[3]; ++batch) + { + auto *src_channel_ptr = src_batch_ptr; + auto *dst_channel_ptr = dst_batch_ptr; + + for (uintptr_t channel = 0; channel < dst_channels; ++channel) + { + auto *src_height_block_ptr = src_channel_ptr; + auto *dst_row_ptr = dst_channel_ptr; + + for (uintptr_t height_block = 0; height_block < src_shape[1]; ++height_block) + { + auto *src_block_row_ptr = src_height_block_ptr; + + for (uintptr_t block_row = 0; block_row < block_size; ++block_row) + { + auto *src_width_block_ptr = src_block_row_ptr; + auto *dst_col_ptr = dst_row_ptr; + + for (uintptr_t width_block = 0; width_block < src_shape[0]; ++width_block) + { + auto *src_block_col_ptr = src_width_block_ptr; + + for (uintptr_t block_col = 0; block_col < block_size; ++block_col) + { + // The source pointer is accumulated as: + // + // src_block_col_ptr = + // src + + // batch * dst_strides[3] + + // (channel + (block_row * block_size + block_col) * dst_channels) * src_strides[2] + + // height_block * src_strides[1] + + // width_block * element_size; + // + // The destination pointer is accumuated as: + // + // dst_col_ptr = + // dst + + // batch * dst_strides[3] + + // channel * dst_strides[2] + + // (height_block * block_size + block_row) * dst_strides[1] + + // (width_block * block_size + block_col) * element_size + + std::memcpy(dst_col_ptr, src_block_col_ptr, element_size); + + src_block_col_ptr += src_block_col_stride; + dst_col_ptr += element_size; + } + + src_width_block_ptr += element_size; + } + + src_block_row_ptr += src_block_row_stride; + dst_row_ptr += dst_strides[1]; + } + + src_height_block_ptr += src_strides[1]; + } + + src_channel_ptr += src_strides[2]; + dst_channel_ptr += dst_strides[2]; + } + + src_batch_ptr += src_strides[3]; + dst_batch_ptr += dst_strides[3]; + } +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp new file mode 100644 index 0000000000..b1c84599dc --- /dev/null +++ b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Error.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ + +void depth_to_space_nhwc_any( // + const uint8_t *src, + uint8_t *dst, + const uintptr_t src_shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4], + uintptr_t element_size, + uintptr_t block_size) +{ + ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size); + ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size); + + const auto src_block_row_stride = (src_shape[0] / block_size) * element_size; + const auto dst_width_block_stride = block_size * dst_strides[1]; + + auto *src_batch_ptr = src; + auto *dst_batch_ptr = dst; + + for (uintptr_t batch = 0; batch < src_shape[3]; ++batch) + { + auto *src_height_block_ptr = src_batch_ptr; + auto *dst_row_ptr = dst_batch_ptr; + + for (uintptr_t height_block = 0; height_block < src_shape[2]; ++height_block) + { + auto *src_block_row_ptr = src_height_block_ptr; + + for (uintptr_t block_row = 0; block_row < block_size; ++block_row) + { + auto *src_width_block_ptr = src_block_row_ptr; + auto *dst_width_block_ptr = dst_row_ptr; + + for (uintptr_t width_block = 0; width_block < src_shape[1]; ++width_block) + { + // The source pointer is accumulated as: + // + // src_width_block_ptr = + // src + + // batch * src_strides[3] + + // height_block * src_strides[2] + + // width_block * src_strides[1] + + // block_row * (src_shape[0] / block_size) * element_size; + // + // The destination pointer is accumulated as: + // + // dst_width_block_ptr = + // dst + + // batch * dst_strides[3] + + // (height_block * block_size + block_row) * dst_strides[2] + + // width_block * block_size * dst_strides[1]; + + std::memcpy(dst_width_block_ptr, src_width_block_ptr, src_block_row_stride); + + src_width_block_ptr += src_strides[1]; + dst_width_block_ptr += dst_width_block_stride; + } + + src_block_row_ptr += src_block_row_stride; + dst_row_ptr += dst_strides[2]; + } + + src_height_block_ptr += src_strides[2]; + } + + src_batch_ptr += src_strides[3]; + dst_batch_ptr += dst_strides[3]; + } +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index 47564059ec..5eea4dca65 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,15 +25,20 @@ #include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" namespace arm_compute { +NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{} +{ +} + +NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default; + void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); @@ -47,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo { return NEDepthToSpaceLayerKernel::validate(input, output, block_shape); } + +void NEDepthToSpaceLayer::run() +{ + NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension()); +} + } // namespace arm_compute -- cgit v1.2.1