aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAnitha Raj <anitha.raj@arm.com>2023-07-14 11:19:34 +0100
committerAnitha Raj <Anitha.Raj@arm.com>2023-08-22 09:42:32 +0000
commiteb5696d99d85e1d402188151e021bc4b14f93969 (patch)
treef78337a676d6cfbb8421ff27315b0d7ad4dffc34 /src
parente1c3b466960d5e3fd5a54871287f5eb6102bfb8c (diff)
downloadComputeLibrary-eb5696d99d85e1d402188151e021bc4b14f93969.tar.gz
Optimize CpuReshapeKernel
Resolves COMPMID-5279 Change-Id: Id9b007eed62c200702bbfcc83b94dab7b5de1714 Signed-off-by: Anitha Raj <anitha.raj@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9962 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: SiCong Li <sicong.li@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/helpers/Utils.h31
-rw-r--r--src/cpu/kernels/CpuReshapeKernel.cpp187
-rw-r--r--src/cpu/kernels/CpuReshapeKernel.h24
-rw-r--r--src/cpu/operators/CpuReshape.cpp16
-rw-r--r--src/cpu/operators/CpuReshape.h9
5 files changed, 226 insertions, 41 deletions
diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
index 326dc962c7..641d536c13 100644
--- a/src/core/helpers/Utils.h
+++ b/src/core/helpers/Utils.h
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2021 Arm Limited.
+* Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define SRC_CORE_HELPERS_UTILS_H
#include "arm_compute/core/ITensorInfo.h"
-
namespace arm_compute
{
/** Create a strides object based on the provided strides and the tensor dimensions.
@@ -38,7 +37,7 @@ namespace arm_compute
* calculated based on the tensor shape and the strides of lower dimensions.
*/
template <typename T, typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixed_strides)
{
const TensorShape &shape = info.tensor_shape();
@@ -92,6 +91,32 @@ inline unsigned int get_next_power_two(unsigned int x)
return x;
}
+
+/** Check if the tensor has any holes.
+ *
+ * @param[in] info Tensor info object defining the shape of the input tensor.
+ * @param[in] dimension Highest dimension to check.
+ *
+ * @note This function checks for holes in all the dimensions up to and including the highest dimension.
+ *
+ */
+inline bool has_holes(const ITensorInfo &info, size_t dimension)
+{
+ const auto &shape = info.tensor_shape();
+ const auto &strides = info.strides_in_bytes();
+ size_t squashed_bytes = info.element_size();
+
+ for(size_t dim = 0; dim <= dimension; ++dim)
+ {
+ if(strides[dim] != squashed_bytes)
+ {
+ return true;
+ }
+ squashed_bytes *= shape[dim];
+ }
+ return false;
+}
+
} // namespace arm_compute
#endif /* SRC_CORE_HELPERS_UTILS_H */
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index 068ff07efa..a9672a8c5e 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,11 +29,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include <cstdint>
/** [NEReshapeLayerKernel Kernel] **/
@@ -61,21 +59,109 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
return Status{};
}
+
template <typename T>
-inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst)
+void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst)
+{
+ const TensorShape &src_shape = src->info()->tensor_shape();
+ const TensorShape &dst_shape = dst->info()->tensor_shape();
+
+ Iterator dst_it(dst, window);
+
+ execute_window_loop(window, [&](const Coordinates & dst_coord)
+ {
+ Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+ const auto output_ptr = dst->ptr_to_element(dst_coord);
+ const auto input_ptr = src->ptr_to_element(src_coord);
+
+ *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
+ },
+ dst_it);
+}
+
+void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst )
+{
+ switch(src->info()->data_type())
+ {
+ case DataType::U8:
+ case DataType::S8:
+ case DataType::QSYMM8:
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ reshape_tensor_per_element<uint8_t>(window, src, dst);
+ break;
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::F16:
+ reshape_tensor_per_element<uint16_t>(window, src, dst);
+ break;
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ reshape_tensor_per_element<uint32_t>(window, src, dst);
+ break;
+ case DataType::U64:
+ case DataType::S64:
+ case DataType::F64:
+ reshape_tensor_per_element<uint64_t>(window, src, dst);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+}
+
+void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *dst)
{
const TensorShape &src_shape = src->info()->tensor_shape();
const TensorShape &dst_shape = dst->info()->tensor_shape();
+ Coordinates src_coord{};
Coordinates dst_coord{};
- Iterator src_it(src, window);
+ const auto element_size = dst->info()->element_size();
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const auto src_row_size = static_cast<int>(src_shape[0]);
+ const auto row_size_in_bytes = src_row_size * element_size;
- execute_window_loop(window, [&](const Coordinates & id)
+ auto output_ptr = dst->ptr_to_element(dst_coord);
+ auto input_ptr = src->ptr_to_element(src_coord);
+
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator dst_it(dst, win);
+ execute_window_loop(win, [&]( Coordinates & id)
{
- dst_coord = index2coords(dst_shape, coords2index(src_shape, id));
- *reinterpret_cast<T *>(dst->ptr_to_element(dst_coord)) = *reinterpret_cast<T *>(src_it.ptr());
+ dst_coord = id;
+
+ for(int x = window_start_x; x < window_end_x; x += src_row_size)
+ {
+ src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+ output_ptr = dst->ptr_to_element(dst_coord);
+ input_ptr = src->ptr_to_element(src_coord);
+
+ std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+
+ dst_coord.increment(Window::DimX, src_row_size);
+ }
},
- src_it);
+ dst_it);
+}
+
+void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst)
+{
+ Iterator src_it(src, window);
+ Iterator dst_it(dst, window);
+
+ const size_t element_size = dst->info()->element_size();
+ const auto window_size = window.x().end() - window.x().start();
+ const auto window_size_in_bytes = window_size * element_size;
+
+ const auto input_ptr = src_it.ptr();
+ const auto output_ptr = dst_it.ptr();
+
+ std::memcpy(output_ptr, input_ptr, window_size_in_bytes);
}
} // namespace
@@ -83,10 +169,11 @@ void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- ARM_COMPUTE_UNUSED(dst);
+ ARM_COMPUTE_UNUSED(src);
+ _reshape_tensor_fn = reshape_tensor_per_element_selector;
// Configure kernel window
- Window win = calculate_max_window(*src);
+ Window win = calculate_max_window(*dst);
ICpuKernel::configure(win);
}
@@ -94,7 +181,6 @@ void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-
return Status{};
}
@@ -106,28 +192,7 @@ void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->data_type())
- {
- case DataType::U8:
- case DataType::S8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- reshape_tensor<uint8_t>(window, src, dst);
- break;
- case DataType::U16:
- case DataType::S16:
- case DataType::F16:
- reshape_tensor<uint16_t>(window, src, dst);
- break;
- case DataType::U32:
- case DataType::S32:
- case DataType::F32:
- reshape_tensor<uint32_t>(window, src, dst);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type!");
- }
+ _reshape_tensor_fn(window, src, dst);
}
const char *CpuReshapeKernel::name() const
@@ -143,6 +208,58 @@ size_t CpuReshapeKernel::get_mws(const CPUInfo &platform, size_t thread_count) c
return ICPPKernel::default_mws;
}
+void CpuReshapeKernel::prepare(ITensorPack &tensors)
+{
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const ITensorInfo* src_info = src->info();
+ const ITensorInfo* dst_info = dst->info();
+
+ // Calculate kernel window based on the padding info
+ Window win;
+
+ const bool src_has_holes = has_holes(*src_info, src_info->num_dimensions() - 1);
+ const bool dst_has_holes = has_holes(*dst_info, dst_info->num_dimensions() - 1);
+ const bool src_has_holes_in_x = has_holes(*src_info, Window::DimX);
+ const bool dst_has_holes_in_x = has_holes(*dst_info, Window::DimX);
+ const auto src_row_size = static_cast<int>(src_info->tensor_shape()[0]);
+ const auto dst_row_size = static_cast<int>(dst_info->tensor_shape()[0]);
+
+ if(!src_has_holes && !dst_has_holes)
+ {
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
+ /*
+ Copy the tensor per window. If the src and dst tensors
+ are contiguous memory allocations without any holes or
+ padding, then the tensor is squashed to 1D window and
+    we can use a single memcpy call to copy the whole
+ window in reshape_tensor_per_window fn
+ */
+ _reshape_tensor_fn = reshape_tensor_per_window;
+ }
+ else
+ {
+ win = calculate_max_window(*dst_info);
+ /*
+ Copy tensor row by row if src and dst have no holes in X
+ dim and they have the same number of elements in their rows
+ */
+ if (!src_has_holes_in_x && !dst_has_holes_in_x && (src_row_size == dst_row_size))
+ {
+ _reshape_tensor_fn = reshape_tensor_per_row;
+ }
+ else
+ {
+ /*
+ Fall back to the element wise copy
+ */
+ _reshape_tensor_fn = reshape_tensor_per_element_selector;
+ }
+ }
+
+ ICPPKernel::configure(win);
+}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index 17302c6731..eddbbf7135 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,6 +58,13 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
+ /** Prepare the reshape kernel for execution (Only executed once) by calculating max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes
+ *
+ * @param[in] tensors Pack of input and output tensors
+ *
+ */
+ void prepare(ITensorPack &tensors);
+
/** Return minimum workload size of the relevant kernel
*
* @param[in] platform The CPU platform used to create the context.
@@ -66,6 +73,21 @@ public:
* @return[out] small_network_mws Minimum workload size for requested configuration.
*/
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+ /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+ *
+ * @return The split dimension.
+ */
+ size_t get_split_dimension() const
+ {
+ return _split_dimension;
+ }
+
+private:
+ size_t _split_dimension{ Window::DimY };
+
+ std::function<void(const Window &window, const ITensor *src, ITensor *dst )> _reshape_tensor_fn{};
+
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
index 79e7b8fe6e..e6892a2e7e 100644
--- a/src/cpu/operators/CpuReshape.cpp
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,8 @@
#include "src/common/utils/Log.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
namespace arm_compute
{
namespace cpu
@@ -43,5 +45,17 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
return kernels::CpuReshapeKernel::validate(src, dst);
}
+
+void CpuReshape::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ if(!_is_prepared)
+ {
+ static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
+ _is_prepared = true;
+ }
+ const auto split_dimension = static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->get_split_dimension();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
index 92dcb09aa9..9bc43e7db4 100644
--- a/src/cpu/operators/CpuReshape.h
+++ b/src/cpu/operators/CpuReshape.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_RESHAPE_H
#include "src/cpu/ICpuOperator.h"
+#include "arm_compute/core/Window.h"
namespace arm_compute
{
@@ -47,6 +48,12 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ bool _is_prepared{ false } ;
};
} // namespace cpu
} // namespace arm_compute