diff options
author | Michalis Spyrou <michalis.spyrou@arm.com> | 2018-06-05 13:04:40 +0100 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:52:54 +0000 |
commit | 343722b441e2be9a156745db2daab305edae0db6 (patch) | |
tree | f784026a823ce6153e78a09793a54d40354a8741 /src/core/CL/kernels/CLReductionOperationKernel.cpp | |
parent | e03342e3ba78ecf5b9128339dd47c30e00cb8565 (diff) | |
download | ComputeLibrary-343722b441e2be9a156745db2daab305edae0db6.tar.gz |
COMPMID-1143 - Excessive padding in CLReductionOperationKernel when configuring window
The maximum padding is now 15 instead of 127. If the input width is less
than 128, we decrease the number of threads in the work-group (WG).
Change-Id: I5ff0b6fd8cb46143ba49e745ec9ad01f691bdd80
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134152
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLReductionOperationKernel.cpp')
-rw-r--r-- | src/core/CL/kernels/CLReductionOperationKernel.cpp | 12 |
1 files changed, 8 insertions, 4 deletions
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 355dc7948e..c44fced3e3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -101,14 +101,18 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
 
     const unsigned int num_elems_processed_per_iteration = 16;
-    const unsigned int border_width = ((input->info()->dimension(0) % 128) != 0) ? 128 - input->info()->dimension(0) % 128 : 0;
+    const unsigned int border_width = ((input->info()->dimension(0) % 16) != 0) ? 16 - input->info()->dimension(0) % 16 : 0;
+    const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
 
     _input = input;
     _output = output;
     _reduction_axis = axis;
     _op = op;
-    _lws_hint = cl::NDRange(8);
-    _border_size = BorderSize(0, border_width, 0, 0);
+
+    // Set the number of WG based on the input size. If input width is < 128
+    // we can use fewer threads than 8.
+    _lws_hint = cl::NDRange(std::min(8U, num_of_threads));
+    _border_size = BorderSize(0, border_width, 0, 0);
 
     // Set build options
     std::set<std::string> build_opts;
@@ -164,7 +168,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
     Window out_slice = out_window.first_slice_window_2D();
 
     // Reshape window
-    const unsigned int border_width = ((in_slice.x().end() % 128) != 0) ? 128 - in_slice.x().end() % 128 : 0;
+    const unsigned int border_width = ((in_slice.x().end() % 16) != 0) ? 16 - in_slice.x().end() % 16 : 0;
     in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
 
     // Set local sums buffer