From eb5696d99d85e1d402188151e021bc4b14f93969 Mon Sep 17 00:00:00 2001
From: Anitha Raj
Date: Fri, 14 Jul 2023 11:19:34 +0100
Subject: Optimize CpuReshapeKernel

Resolves COMPMID-5279

Change-Id: Id9b007eed62c200702bbfcc83b94dab7b5de1714
Signed-off-by: Anitha Raj
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9962
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: SiCong Li
Reviewed-by: Viet-Hoa Do
Benchmark: Arm Jenkins
---
 src/core/helpers/Utils.h             |  31 +++++-
 src/cpu/kernels/CpuReshapeKernel.cpp | 187 ++++++++++++++++++++++++++++-------
 src/cpu/kernels/CpuReshapeKernel.h   |  24 ++++-
 src/cpu/operators/CpuReshape.cpp     |  16 ++-
 src/cpu/operators/CpuReshape.h       |   9 +-
 5 files changed, 226 insertions(+), 41 deletions(-)

diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
index 326dc962c7..641d536c13 100644
--- a/src/core/helpers/Utils.h
+++ b/src/core/helpers/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,6 @@
 #define SRC_CORE_HELPERS_UTILS_H
 
 #include "arm_compute/core/ITensorInfo.h"
-
 namespace arm_compute
 {
 /** Create a strides object based on the provided strides and the tensor dimensions.
@@ -38,7 +37,7 @@ namespace arm_compute
  *         calculated based on the tensor shape and the strides of lower dimensions.
  */
 template <typename T, typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixed_strides)
 {
     const TensorShape &shape = info.tensor_shape();
 
@@ -92,6 +91,32 @@ inline unsigned int get_next_power_two(unsigned int x)
     return x;
 }
+
+/** Check if the tensor has any holes.
+ *
+ * @param[in] info      Tensor info object defining the shape of the input tensor.
+ * @param[in] dimension Highest dimension to check.
+ *
+ * @note This function checks for holes in all the dimensions up to and including the highest dimension.
+ *
+ */
+inline bool has_holes(const ITensorInfo &info, size_t dimension)
+{
+    const auto &shape          = info.tensor_shape();
+    const auto &strides        = info.strides_in_bytes();
+    size_t      squashed_bytes = info.element_size();
+
+    for(size_t dim = 0; dim <= dimension; ++dim)
+    {
+        if(strides[dim] != squashed_bytes)
+        {
+            return true;
+        }
+        squashed_bytes *= shape[dim];
+    }
+    return false;
+}
 } // namespace arm_compute
 #endif /* SRC_CORE_HELPERS_UTILS_H */
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index 068ff07efa..a9672a8c5e 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,11 +29,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include <cstdint>
 
 /** [NEReshapeLayerKernel Kernel] **/
@@ -61,21 +59,109 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 
     return Status{};
 }
+
 template <typename T>
-inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst)
+void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst)
+{
+    const TensorShape &src_shape = src->info()->tensor_shape();
+    const TensorShape &dst_shape = dst->info()->tensor_shape();
+
+    Iterator dst_it(dst, window);
+
+    execute_window_loop(window, [&](const Coordinates &dst_coord)
+    {
+        Coordinates src_coord  = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+        const auto  output_ptr = dst->ptr_to_element(dst_coord);
+        const auto  input_ptr  = src->ptr_to_element(src_coord);
+
+        *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
+    },
+    dst_it);
+}
+
+void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst)
+{
+    switch(src->info()->data_type())
+    {
+        case DataType::U8:
+        case DataType::S8:
+        case DataType::QSYMM8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+            reshape_tensor_per_element<uint8_t>(window, src, dst);
+            break;
+        case DataType::U16:
+        case DataType::S16:
+        case DataType::F16:
+            reshape_tensor_per_element<uint16_t>(window, src, dst);
+            break;
+        case DataType::U32:
+        case DataType::S32:
+        case DataType::F32:
+            reshape_tensor_per_element<uint32_t>(window, src, dst);
+            break;
+        case DataType::U64:
+        case DataType::S64:
+        case DataType::F64:
+            reshape_tensor_per_element<uint64_t>(window, src, dst);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type!");
+    }
+}
+
+void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *dst)
 {
     const TensorShape &src_shape = src->info()->tensor_shape();
     const TensorShape &dst_shape = dst->info()->tensor_shape();
+
+    Coordinates src_coord{};
     Coordinates dst_coord{};
-    Iterator src_it(src, window);
+    const auto element_size      = dst->info()->element_size();
+    const auto window_start_x    = static_cast<int>(window.x().start());
+    const auto window_end_x      = static_cast<int>(window.x().end());
+    const auto src_row_size      = static_cast<int>(src_shape[0]);
+    const auto row_size_in_bytes = src_row_size * element_size;
 
-    execute_window_loop(window, [&](const Coordinates &id)
+    auto output_ptr = dst->ptr_to_element(dst_coord);
+    auto input_ptr  = src->ptr_to_element(src_coord);
+
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator dst_it(dst, win);
+    execute_window_loop(win, [&](Coordinates &id)
     {
-        dst_coord = index2coords(dst_shape, coords2index(src_shape, id));
-        *reinterpret_cast<T *>(dst->ptr_to_element(dst_coord)) = *reinterpret_cast<T *>(src_it.ptr());
+        dst_coord = id;
+
+        for(int x = window_start_x; x < window_end_x; x += src_row_size)
+        {
+            src_coord  = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+            output_ptr = dst->ptr_to_element(dst_coord);
+            input_ptr  = src->ptr_to_element(src_coord);
+
+            std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+
+            dst_coord.increment(Window::DimX, src_row_size);
+        }
     },
-    src_it);
+    dst_it);
+}
+
+void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst)
+{
+    Iterator src_it(src, window);
+    Iterator dst_it(dst, window);
+
+    const size_t element_size         = dst->info()->element_size();
+    const auto   window_size          = window.x().end() - window.x().start();
+    const auto   window_size_in_bytes = window_size * element_size;
+
+    const auto input_ptr  = src_it.ptr();
+    const auto output_ptr = dst_it.ptr();
+
+    std::memcpy(output_ptr, input_ptr, window_size_in_bytes);
 }
 } // namespace
 
@@ -83,10 +169,11 @@ void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-    ARM_COMPUTE_UNUSED(dst);
+    ARM_COMPUTE_UNUSED(src);
+
+    _reshape_tensor_fn = reshape_tensor_per_element_selector;
 
     // Configure kernel window
-    Window win = calculate_max_window(*src);
+    Window win = calculate_max_window(*dst);
 
     ICpuKernel::configure(win);
 }
@@ -94,7 +181,6 @@ void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
 
 Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-
     return Status{};
 }
 
@@ -106,28 +192,7 @@ void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
-
-    switch(src->info()->data_type())
-    {
-        case DataType::U8:
-        case DataType::S8:
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-            reshape_tensor<uint8_t>(window, src, dst);
-            break;
-        case DataType::U16:
-        case DataType::S16:
-        case DataType::F16:
-            reshape_tensor<uint16_t>(window, src, dst);
-            break;
-        case DataType::U32:
-        case DataType::S32:
-        case DataType::F32:
-            reshape_tensor<uint32_t>(window, src, dst);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type!");
-    }
+    _reshape_tensor_fn(window, src, dst);
 }
 
 const char *CpuReshapeKernel::name() const
@@ -143,6 +208,58 @@ size_t CpuReshapeKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
 
     return ICPPKernel::default_mws;
 }
 
+void CpuReshapeKernel::prepare(ITensorPack &tensors)
+{
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+
+    const ITensorInfo *src_info = src->info();
+    const ITensorInfo *dst_info = dst->info();
+
+    // Calculate kernel window based on the padding info
+    Window win;
+
+    const bool src_has_holes      = has_holes(*src_info, src_info->num_dimensions() - 1);
+    const bool dst_has_holes      = has_holes(*dst_info, dst_info->num_dimensions() - 1);
+    const bool src_has_holes_in_x = has_holes(*src_info, Window::DimX);
+    const bool dst_has_holes_in_x = has_holes(*dst_info, Window::DimX);
+    const auto src_row_size       = static_cast<int>(src_info->tensor_shape()[0]);
+    const auto dst_row_size       = static_cast<int>(dst_info->tensor_shape()[0]);
+
+    if(!src_has_holes && !dst_has_holes)
+    {
+        std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
+        /*
+            Copy the tensor per window. If the src and dst tensors
+            are contiguous memory allocations without any holes or
+            padding, then the tensor is squashed to a 1D window and
+            we can use a single memcpy call to copy the whole
+            window in the reshape_tensor_per_window fn
+        */
+        _reshape_tensor_fn = reshape_tensor_per_window;
+    }
+    else
+    {
+        win = calculate_max_window(*dst_info);
+        /*
+            Copy the tensor row by row if src and dst have no holes in the X
+            dim and they have the same number of elements in their rows
+        */
+        if(!src_has_holes_in_x && !dst_has_holes_in_x && (src_row_size == dst_row_size))
+        {
+            _reshape_tensor_fn = reshape_tensor_per_row;
+        }
+        else
+        {
+            /*
+                Fall back to the element-wise copy
+            */
+            _reshape_tensor_fn = reshape_tensor_per_element_selector;
+        }
+    }
+
+    ICPPKernel::configure(win);
+}
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index 17302c6731..eddbbf7135 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,6 +58,13 @@ public:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
+    /** Prepare the reshape kernel for execution (only executed once) by calculating the max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes
+     *
+     * @param[in] tensors Pack of input and output tensors
+     *
+     */
+    void prepare(ITensorPack &tensors);
+
     /** Return minimum workload size of the relevant kernel
      *
      * @param[in] platform     The CPU platform used to create the context.
@@ -66,6 +73,21 @@ public:
      * @return[out] small_network_mws Minimum workload size for requested configuration.
      */
     size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+    /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+     *
+     * @return The split dimension.
+     */
+    size_t get_split_dimension() const
+    {
+        return _split_dimension;
+    }
+
+private:
+    size_t _split_dimension{ Window::DimY };
+
+    std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{};
+
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
index 79e7b8fe6e..e6892a2e7e 100644
--- a/src/cpu/operators/CpuReshape.cpp
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 
 #include "src/common/utils/Log.h"
 
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 namespace arm_compute
 {
 namespace cpu
@@ -43,5 +45,17 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
     return kernels::CpuReshapeKernel::validate(src, dst);
 }
+
+void CpuReshape::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    if(!_is_prepared)
+    {
+        static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
+        _is_prepared = true;
+    }
+    const auto split_dimension = static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->get_split_dimension();
+    NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
index 92dcb09aa9..9bc43e7db4 100644
--- a/src/cpu/operators/CpuReshape.h
+++ b/src/cpu/operators/CpuReshape.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_RESHAPE_H
 
 #include "src/cpu/ICpuOperator.h"
+#include "arm_compute/core/Window.h"
 
 namespace arm_compute
 {
@@ -47,6 +48,12 @@ public:
      * @return a status
      */
    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    bool _is_prepared{ false };
 };
 } // namespace cpu
 } // namespace arm_compute
--
cgit v1.2.1
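
The standalone sketch below is not part of the patch; it only illustrates the stride-based hole check that the new has_holes() helper performs and that CpuReshapeKernel::prepare() uses to choose between the per-window, per-row and per-element copy paths. The helper is re-implemented on plain arrays so it compiles without the library, and the padded-stride values (a 4x4 FP32 tensor whose rows are padded to 5 elements) are assumed purely for illustration.

#include <cstddef>
#include <cstdio>

// Mirrors the logic of arm_compute::has_holes(): a dimension has a "hole" when its
// byte stride differs from the bytes spanned by all lower dimensions combined.
static bool has_holes(const size_t *shape, const size_t *strides_in_bytes, size_t element_size, size_t last_dim)
{
    size_t squashed_bytes = element_size;
    for(size_t dim = 0; dim <= last_dim; ++dim)
    {
        if(strides_in_bytes[dim] != squashed_bytes)
        {
            return true; // padding between rows/planes -> cannot squash into one bulk copy
        }
        squashed_bytes *= shape[dim];
    }
    return false;
}

int main()
{
    const size_t shape[2]          = { 4, 4 };  // 4x4 tensor
    const size_t dense_strides[2]  = { 4, 16 }; // FP32, rows tightly packed
    const size_t padded_strides[2] = { 4, 20 }; // FP32, one element of row padding (assumed)

    // Dense tensor: no holes, so the reshape can be squashed to a single memcpy per window.
    std::printf("dense has holes:  %d\n", has_holes(shape, dense_strides, sizeof(float), 1));
    // Padded tensor: hole in dimension 1, so the kernel falls back to row or element copies.
    std::printf("padded has holes: %d\n", has_holes(shape, padded_strides, sizeof(float), 1));
    return 0;
}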