aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/kernels/CLReductionOperationKernel.cpp
diff options
context:
space:
mode:
author Michalis Spyrou <michalis.spyrou@arm.com> 2018-06-05 13:04:40 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:52:54 +0000
commit 343722b441e2be9a156745db2daab305edae0db6 (patch)
tree f784026a823ce6153e78a09793a54d40354a8741 /src/core/CL/kernels/CLReductionOperationKernel.cpp
parent e03342e3ba78ecf5b9128339dd47c30e00cb8565 (diff)
download ComputeLibrary-343722b441e2be9a156745db2daab305edae0db6.tar.gz
COMPMID-1143 - Excessive padding in CLReductionOperationKernel when configuring window
The maximum padding is now 15 instead of 127. If the input width is less than 128, we decrease the number of threads in the work-group (WG).

Change-Id: I5ff0b6fd8cb46143ba49e745ec9ad01f691bdd80
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134152
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLReductionOperationKernel.cpp')
-rw-r--r-- src/core/CL/kernels/CLReductionOperationKernel.cpp 12
1 files changed, 8 insertions, 4 deletions
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 355dc7948e..c44fced3e3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -101,14 +101,18 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
const unsigned int num_elems_processed_per_iteration = 16;
- const unsigned int border_width = ((input->info()->dimension(0) % 128) != 0) ? 128 - input->info()->dimension(0) % 128 : 0;
+ const unsigned int border_width = ((input->info()->dimension(0) % 16) != 0) ? 16 - input->info()->dimension(0) % 16 : 0;
+ const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
_input = input;
_output = output;
_reduction_axis = axis;
_op = op;
- _lws_hint = cl::NDRange(8);
- _border_size = BorderSize(0, border_width, 0, 0);
+
+ // Set the number of threads per work-group (WG) based on the input width.
+ // If the padded input width is < 128 we can use fewer than 8 threads.
+ _lws_hint = cl::NDRange(std::min(8U, num_of_threads));
+ _border_size = BorderSize(0, border_width, 0, 0);
// Set build options
std::set<std::string> build_opts;
@@ -164,7 +168,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
Window out_slice = out_window.first_slice_window_2D();
// Reshape window
- const unsigned int border_width = ((in_slice.x().end() % 128) != 0) ? 128 - in_slice.x().end() % 128 : 0;
+ const unsigned int border_width = ((in_slice.x().end() % 16) != 0) ? 16 - in_slice.x().end() % 16 : 0;
in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
// Set local sums buffer