aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2020-11-11 21:05:24 +0000
committerSheri Zhang <sheri.zhang@arm.com>2020-11-13 10:34:52 +0000
commit412b789582c5992431028e9b91c4d8b99d5c4900 (patch)
tree62f7750c113dae21c877f6ed384b73c24dc4221b
parentd7341fb9e3b24b904edf7ac9d83e1e063bc77765 (diff)
downloadComputeLibrary-412b789582c5992431028e9b91c4d8b99d5c4900.tar.gz
COMPMID-3852: Fix NEReduction window
ReductionOperations splits the kernel for scheduling on the X dimension when reduction axis is > 0. By setting the execution window to be unit one in the X dimension the execution was always restricted to a single thread. Alters the window to enable multi-threading Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Change-Id: Idcbe2b78957678310bb8e895969f01de972d3667 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4389 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
-rw-r--r--src/core/NEON/kernels/NEReductionOperationKernel.cpp158
1 files changed, 79 insertions, 79 deletions
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 4e63dd95aa..3d105cc60d 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -391,15 +391,18 @@ struct RedOpX
inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
{
- const TensorInfo in_info = *(in->info());
+ const TensorInfo in_info = *(in->info());
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(in_window.x().start());
+ const auto window_end_x = static_cast<int>(in_window.x().end());
- Iterator input(in, in_window);
- Iterator output(out, out_window);
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(in_window.x().start());
- const auto window_end_x = static_cast<int>(in_window.x().end());
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_window);
- execute_window_loop(in_window, [&](const Coordinates &)
+ execute_window_loop(in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
@@ -603,13 +606,17 @@ struct RedOpX_quantized
const TensorInfo in_info = *(in->info());
const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
- Iterator input(in, in_window);
- Iterator output(out, out_window);
const int window_step_x = 16 / sizeof(T);
const auto window_start_x = static_cast<int>(in_window.x().start());
const auto window_end_x = static_cast<int>(in_window.x().end());
- execute_window_loop(in_window, [&](const Coordinates &)
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_window);
+
+ execute_window_loop(in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -858,15 +865,23 @@ struct RedOpYZW
inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
{
- const TensorInfo in_info = *(in->info());
-
- Iterator input(in, in_window);
- Iterator output(out, out_window);
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(in_window.x().start());
- const auto window_end_x = static_cast<int>(in_window.x().end());
-
- execute_window_loop(in_window, [&](const Coordinates &)
+ const TensorInfo in_info = *(in->info());
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
+ const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
+ // As it split over x-axis, need to set the correct spiltted window start and end.
+ const auto window_start_x = static_cast<int>(0);
+ const auto window_end_x = static_cast<int>(in_window.shape().x());
+
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
+ Window out_win_no_pad = out_window;
+ out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_win_no_pad);
+
+ execute_window_loop(in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -1070,18 +1085,26 @@ struct RedOpYZW_complex
inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
{
ARM_COMPUTE_ERROR_ON(axis != 2);
-
- const TensorInfo in_info = *(in->info());
-
- Iterator input(in, in_window);
- Iterator output(out, out_window);
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(in_window.x().start());
- const auto window_end_x = static_cast<int>(in_window.x().end());
-
- const size_t stride_z = in_info.strides_in_bytes()[axis];
-
- execute_window_loop(in_window, [&](const Coordinates &)
+ ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM);
+
+ const TensorInfo in_info = *(in->info());
+ const size_t stride_z = in_info.strides_in_bytes()[axis];
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
+ const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
+ // As it split over x-axis, need to set the correct spiltted window start and end.
+ const auto window_start_x = static_cast<int>(0);
+ const auto window_end_x = static_cast<int>(in_window.shape().x());
+
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
+ Window out_win_no_pad = out_window;
+ out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_win_no_pad);
+
+ execute_window_loop(in_win_no_pad, [&](const Coordinates &)
{
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -1096,29 +1119,14 @@ struct RedOpYZW_complex
T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
- T *in_ptr_0;
- T *in_ptr_1;
- switch(axis)
- {
- case 2:
- in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
+ T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
+
const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
- switch(op)
- {
- case ReductionOperation::SUM:
- vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
- vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
+ vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
+ vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
}
wrapper::vstore(out_ptr, vec_res_value_0);
@@ -1134,24 +1142,9 @@ struct RedOpYZW_complex
T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
- T *in_ptr;
- switch(axis)
- {
- case 2:
- in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- switch(op)
- {
- case ReductionOperation::SUM:
- res_value_0 += *in_ptr;
- res_value_1 += *(in_ptr + 1);
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
+ T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ res_value_0 += *in_ptr;
+ res_value_1 += *(in_ptr + 1);
}
*out_ptr = res_value_0;
*(out_ptr + 1) = res_value_1;
@@ -1166,19 +1159,26 @@ struct RedOpYZW_quantized
{
inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
{
- const TensorInfo in_info = *(in->info());
+ const TensorInfo in_info = *(in->info());
+ const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
+ using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
- Iterator input(in, in_window);
- Iterator output(out, out_window);
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(in_window.x().start());
- const auto window_end_x = static_cast<int>(in_window.x().end());
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
+ const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
+ // As it split over x-axis, need to set the correct spiltted window start and end.
+ const auto window_start_x = static_cast<int>(0);
+ const auto window_end_x = static_cast<int>(in_window.shape().x());
- using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+ Window in_win_no_pad = in_window;
+ in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
+ Window out_win_no_pad = out_window;
+ out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
- const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
+ Iterator input(in, in_win_no_pad);
+ Iterator output(out, out_win_no_pad);
- execute_window_loop(in_window, [&](const Coordinates &)
+ execute_window_loop(in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -1672,7 +1672,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
Coordinates coord;
coord.set_num_dimensions(input->info()->num_dimensions());
input->info()->set_valid_region(ValidRegion(coord, input->info()->tensor_shape()));
- Window win = calculate_max_window(*input->info(), Steps(input->info()->dimension(0)));
+ Window win = calculate_max_window(*input->info(), Steps());
INEKernel::configure(win);
// Calculate output shape and set if empty