aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorViet-Hoa Do <viet-hoa.do@arm.com>2023-11-13 17:20:45 +0000
committerViet-Hoa Do <viet-hoa.do@arm.com>2023-12-07 09:42:47 +0000
commit47370943471c98b5ed4c954b350b925d368a810e (patch)
treef5fd7d4c4a51b538874632607887b93d8ece3c07
parent17e116e90e6b962a09c133c646b6ad7884e94693 (diff)
downloadComputeLibrary-47370943471c98b5ed4c954b350b925d368a810e.tar.gz
Optimize CPU depth-to-space
Resolves: COMPMID-6622 Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com> Change-Id: Ibac276618bdda125dcbb9c851c547f12739b15b4 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10749 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp2
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h23
-rw-r--r--docs/user_guide/release_version_and_change_log.dox1
-rw-r--r--filelist.json4
-rw-r--r--src/BUILD.bazel2
-rw-r--r--src/CMakeLists.txt2
-rw-r--r--src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp150
-rw-r--r--src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h20
-rw-r--r--src/cpu/kernels/depth_to_space/list.h47
-rw-r--r--src/cpu/kernels/depth_to_space/nchw/any/impl.cpp123
-rw-r--r--src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp105
-rw-r--r--src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp17
12 files changed, 419 insertions, 77 deletions
diff --git a/Android.bp b/Android.bp
index 23b264a3a9..0502e2c954 100644
--- a/Android.bp
+++ b/Android.bp
@@ -488,6 +488,8 @@ cc_library_static {
"src/cpu/kernels/crop/generic/neon/fp16.cpp",
"src/cpu/kernels/crop/generic/neon/fp32.cpp",
"src/cpu/kernels/crop/generic/neon/integer.cpp",
+ "src/cpu/kernels/depth_to_space/nchw/any/impl.cpp",
+ "src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
index c7df29a704..d27369670e 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,25 +21,27 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
class ITensorInfo;
+class NEDepthToSpaceLayerKernel;
/** Basic function to run @ref NEDepthToSpaceLayerKernel. */
-class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder
+class NEDepthToSpaceLayer : public IFunction
{
public:
/** Constructor */
- NEDepthToSpaceLayer() = default;
+ NEDepthToSpaceLayer();
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -49,7 +51,7 @@ public:
/** Prevent instances of this class from being moved (As this class contains non movable objects) */
NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete;
/** Default destructor */
- ~NEDepthToSpaceLayer() = default;
+ ~NEDepthToSpaceLayer();
/** Set the input and output tensors.
*
* Valid data layouts:
@@ -75,6 +77,11 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+ void run() override;
+
+private:
+ std::unique_ptr<NEDepthToSpaceLayerKernel> _kernel;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index ac4f0610ea..f1d3b26c0c 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -48,6 +48,7 @@ v24.01 Public major release
- Add support for FP16 in all multi_isa builds.
- Performance optimizations:
- Optimize @ref NESoftmaxLayer
+ - Optimize @ref NEDepthToSpaceLayer.
v23.11 Public major release
- New features
diff --git a/filelist.json b/filelist.json
index 60f4285a03..3fd48e669c 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1217,7 +1217,9 @@
"files": {
"common": [
"src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp",
- "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp"
+ "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp",
+ "src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp",
+ "src/cpu/kernels/depth_to_space/nchw/any/impl.cpp"
]
}
},
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index c14e10c836..f0c4b52688 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -733,6 +733,8 @@ filegroup(
"cpu/kernels/crop/generic/neon/fp16.cpp",
"cpu/kernels/crop/generic/neon/fp32.cpp",
"cpu/kernels/crop/generic/neon/integer.cpp",
+ "cpu/kernels/depth_to_space/nchw/any/impl.cpp",
+ "cpu/kernels/depth_to_space/nhwc/any/impl.cpp",
"cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp",
"cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp",
"cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e6c6782da1..0124574765 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -724,6 +724,8 @@ target_sources(
cpu/kernels/crop/generic/neon/fp16.cpp
cpu/kernels/crop/generic/neon/fp32.cpp
cpu/kernels/crop/generic/neon/integer.cpp
+ cpu/kernels/depth_to_space/nchw/any/impl.cpp
+ cpu/kernels/depth_to_space/nhwc/any/impl.cpp
cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index de0079ee60..e0eb5cf202 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
+#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
@@ -31,13 +32,10 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/depth_to_space/list.h"
-#include <arm_neon.h>
#include <cstdint>
-using namespace arm_compute::misc::shape_calculator;
-
namespace arm_compute
{
namespace
@@ -70,15 +68,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
} // namespace
NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
+ : _input(nullptr),
+ _output(nullptr),
+ _block_shape(),
+ _data_layout(DataLayout::UNKNOWN),
+ _split_dimension(Window::DimY)
{
}
void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape =
- compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape = misc::shape_calculator::compute_depth_to_space_shape(
+ input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -90,9 +92,31 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output,
_block_shape = block_shape;
_data_layout = input->info()->data_layout();
+ constexpr size_t dim_b = 3;
+ const auto dim_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const auto dim_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const auto dim_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_ERROR_ON(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES) != dim_b);
+
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Steps steps;
+ steps.set(dim_h, block_shape);
+ steps.set(dim_w, block_shape);
+ steps.set(dim_c, output->info()->dimension(dim_c));
+
+ Window win = calculate_max_window(*output->info(), steps);
ICPPKernel::configure(win);
+
+ const auto num_batches = input->info()->tensor_shape().total_size_upper(dim_b);
+ if (num_batches > 1)
+ {
+ _split_dimension = dim_b;
+ }
+ else
+ {
+ _split_dimension = dim_h;
+ }
}
Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
@@ -102,68 +126,80 @@ Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITens
return Status{};
}
+size_t NEDepthToSpaceLayerKernel::get_split_dimension() const
+{
+ return _split_dimension;
+}
+
void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const int depth_size = _input->info()->dimension(idx_channel);
- const int r = (depth_size / (_block_shape * _block_shape));
- const int element_size = _input->info()->element_size();
+ const auto *input_info = _input->info();
+ const auto *output_info = _output->info();
+
+ const auto element_size = input_info->element_size();
+ const auto &input_strides = input_info->strides_in_bytes();
+ const auto &output_strides = output_info->strides_in_bytes();
+
+ const auto &input_shape = input_info->tensor_shape();
- Window slice_out = window.first_slice_window_3D();
+ const uintptr_t k_input_strides[] = {input_strides[0], input_strides[1], input_strides[2], input_strides[3]};
+ const uintptr_t k_output_strides[] = {output_strides[0], output_strides[1], output_strides[2], output_strides[3]};
- // The slice_out slice does not move
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ const uint8_t *k_input_ptr = _input->buffer();
+ uint8_t *k_output_ptr = //
+ _output->buffer() + //
+ window[3].start() * output_strides[3] + //
+ window[2].start() * output_strides[2] + //
+ window[1].start() * output_strides[1] + //
+ window[0].start() * output_strides[0];
- // Main loop for NCHW and NHWC
if (_data_layout == DataLayout::NCHW)
{
- Window slice_in = window.first_slice_window_2D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(
- slice_in,
- [&](const Coordinates &id)
- {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{out_x, out_y, z, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_2D(slice_in));
+ ARM_COMPUTE_ERROR_ON_MSG(window[2].start() != 0 || window[2].end() != window[2].step(),
+ "The window cannot be splitted in channel dimension");
+
+ const uintptr_t k_input_shape[] = {
+ window.num_iterations(0), //
+ window.num_iterations(1), //
+ input_shape[2], // The window cannot be splitted in channel dimension.
+ window.num_iterations(3) //
+ };
+
+ k_input_ptr += window[3].start() * input_strides[3] + //
+ window[2].start() * _block_shape * _block_shape * input_strides[2] + //
+ (window[1].start() / _block_shape) * input_strides[1] + //
+ (window[0].start() / _block_shape) * input_strides[0];
+
+ cpu::depth_to_space_nchw_any( //
+ k_input_ptr, k_output_ptr, //
+ k_input_shape, k_input_strides, k_output_strides, //
+ element_size, _block_shape);
}
else
{
- Window slice_in = window.first_slice_window_3D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(
- slice_in,
- [&](const Coordinates &id)
- {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{z, out_x, out_y, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_3D(slice_in));
+ ARM_COMPUTE_ERROR_ON_MSG(window[0].start() != 0 || window[0].end() != window[0].step(),
+ "The window cannot be splitted in channel dimension");
+
+ const uintptr_t k_input_shape[] = {
+ input_shape[0], // The window cannot be splitted in channel dimension.
+ window.num_iterations(1), //
+ window.num_iterations(2), //
+ window.num_iterations(3) //
+ };
+
+ k_input_ptr += window[3].start() * input_strides[3] + //
+ (window[2].start() / _block_shape) * input_strides[2] + //
+ (window[1].start() / _block_shape) * input_strides[1] + //
+ window[0].start() * _block_shape * _block_shape * input_strides[0];
+
+ cpu::depth_to_space_nhwc_any( //
+ k_input_ptr, k_output_ptr, //
+ k_input_shape, k_input_strides, k_output_strides, //
+ element_size, _block_shape);
}
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
index 7e18dd88b8..ca431ec5fe 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -68,14 +68,18 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+ /** Get the dimension the scheduler should use to split. */
+ size_t get_split_dimension() const;
+
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
- DataLayout _data_layout; /**< Data layout of the operation */
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+ int32_t _block_shape; /**< Block shape */
+ DataLayout _data_layout; /**< Data layout of the operation */
+ size_t _split_dimension; /**< The dimension the scheduler should use to split the workload. */
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
diff --git a/src/cpu/kernels/depth_to_space/list.h b/src/cpu/kernels/depth_to_space/list.h
new file mode 100644
index 0000000000..9d0cd1e740
--- /dev/null
+++ b/src/cpu/kernels/depth_to_space/list.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H
+#define ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#define DECLARE_DEPTHTOSPACE_KERNEL(func_name) \
+ void func_name(const uint8_t *src, uint8_t *dst, const uintptr_t src_shape[4], const uintptr_t src_strides[4], \
+ const uintptr_t dst_strides[4], uintptr_t element_size, uintptr_t block_size)
+
+DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nhwc_any);
+DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nchw_any);
+
+#undef DECLARE_DEPTHTOSPACE_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H
diff --git a/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp
new file mode 100644
index 0000000000..0277690112
--- /dev/null
+++ b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Error.h"
+
+#include <cstdint>
+#include <cstring>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+void depth_to_space_nchw_any( //
+ const uint8_t *src,
+ uint8_t *dst,
+ const uintptr_t src_shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t dst_strides[4],
+ uintptr_t element_size,
+ uintptr_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size);
+ ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size);
+
+ const auto dst_channels = src_shape[2] / (block_size * block_size);
+ const auto src_block_col_stride = dst_channels * src_strides[2];
+ const auto src_block_row_stride = block_size * dst_channels * src_strides[2];
+
+ auto *src_batch_ptr = src;
+ auto *dst_batch_ptr = dst;
+
+ for (uintptr_t batch = 0; batch < src_shape[3]; ++batch)
+ {
+ auto *src_channel_ptr = src_batch_ptr;
+ auto *dst_channel_ptr = dst_batch_ptr;
+
+ for (uintptr_t channel = 0; channel < dst_channels; ++channel)
+ {
+ auto *src_height_block_ptr = src_channel_ptr;
+ auto *dst_row_ptr = dst_channel_ptr;
+
+ for (uintptr_t height_block = 0; height_block < src_shape[1]; ++height_block)
+ {
+ auto *src_block_row_ptr = src_height_block_ptr;
+
+ for (uintptr_t block_row = 0; block_row < block_size; ++block_row)
+ {
+ auto *src_width_block_ptr = src_block_row_ptr;
+ auto *dst_col_ptr = dst_row_ptr;
+
+ for (uintptr_t width_block = 0; width_block < src_shape[0]; ++width_block)
+ {
+ auto *src_block_col_ptr = src_width_block_ptr;
+
+ for (uintptr_t block_col = 0; block_col < block_size; ++block_col)
+ {
+ // The source pointer is accumulated as:
+ //
+ // src_block_col_ptr =
+ // src +
+ // batch * dst_strides[3] +
+ // (channel + (block_row * block_size + block_col) * dst_channels) * src_strides[2] +
+ // height_block * src_strides[1] +
+ // width_block * element_size;
+ //
+ // The destination pointer is accumuated as:
+ //
+ // dst_col_ptr =
+ // dst +
+ // batch * dst_strides[3] +
+ // channel * dst_strides[2] +
+ // (height_block * block_size + block_row) * dst_strides[1] +
+ // (width_block * block_size + block_col) * element_size
+
+ std::memcpy(dst_col_ptr, src_block_col_ptr, element_size);
+
+ src_block_col_ptr += src_block_col_stride;
+ dst_col_ptr += element_size;
+ }
+
+ src_width_block_ptr += element_size;
+ }
+
+ src_block_row_ptr += src_block_row_stride;
+ dst_row_ptr += dst_strides[1];
+ }
+
+ src_height_block_ptr += src_strides[1];
+ }
+
+ src_channel_ptr += src_strides[2];
+ dst_channel_ptr += dst_strides[2];
+ }
+
+ src_batch_ptr += src_strides[3];
+ dst_batch_ptr += dst_strides[3];
+ }
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp
new file mode 100644
index 0000000000..b1c84599dc
--- /dev/null
+++ b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Error.h"
+
+#include <cstdint>
+#include <cstring>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+void depth_to_space_nhwc_any( //
+ const uint8_t *src,
+ uint8_t *dst,
+ const uintptr_t src_shape[4],
+ const uintptr_t src_strides[4],
+ const uintptr_t dst_strides[4],
+ uintptr_t element_size,
+ uintptr_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size);
+ ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size);
+
+ const auto src_block_row_stride = (src_shape[0] / block_size) * element_size;
+ const auto dst_width_block_stride = block_size * dst_strides[1];
+
+ auto *src_batch_ptr = src;
+ auto *dst_batch_ptr = dst;
+
+ for (uintptr_t batch = 0; batch < src_shape[3]; ++batch)
+ {
+ auto *src_height_block_ptr = src_batch_ptr;
+ auto *dst_row_ptr = dst_batch_ptr;
+
+ for (uintptr_t height_block = 0; height_block < src_shape[2]; ++height_block)
+ {
+ auto *src_block_row_ptr = src_height_block_ptr;
+
+ for (uintptr_t block_row = 0; block_row < block_size; ++block_row)
+ {
+ auto *src_width_block_ptr = src_block_row_ptr;
+ auto *dst_width_block_ptr = dst_row_ptr;
+
+ for (uintptr_t width_block = 0; width_block < src_shape[1]; ++width_block)
+ {
+ // The source pointer is accumulated as:
+ //
+ // src_width_block_ptr =
+ // src +
+ // batch * src_strides[3] +
+ // height_block * src_strides[2] +
+ // width_block * src_strides[1] +
+ // block_row * (src_shape[0] / block_size) * element_size;
+ //
+ // The destination pointer is accumulated as:
+ //
+ // dst_width_block_ptr =
+ // dst +
+ // batch * dst_strides[3] +
+ // (height_block * block_size + block_row) * dst_strides[2] +
+ // width_block * block_size * dst_strides[1];
+
+ std::memcpy(dst_width_block_ptr, src_width_block_ptr, src_block_row_stride);
+
+ src_width_block_ptr += src_strides[1];
+ dst_width_block_ptr += dst_width_block_stride;
+ }
+
+ src_block_row_ptr += src_block_row_stride;
+ dst_row_ptr += dst_strides[2];
+ }
+
+ src_height_block_ptr += src_strides[2];
+ }
+
+ src_batch_ptr += src_strides[3];
+ dst_batch_ptr += dst_strides[3];
+ }
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index 47564059ec..5eea4dca65 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,15 +25,20 @@
#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
namespace arm_compute
{
+NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{}
+{
+}
+
+NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default;
+
void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
@@ -47,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo
{
return NEDepthToSpaceLayerKernel::validate(input, output, block_shape);
}
+
+void NEDepthToSpaceLayer::run()
+{
+ NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension());
+}
+
} // namespace arm_compute