COMPMID-1410 (Nightly) Mismatches in CLMeanStd function for float

Change the CLReductionOperation border to be a multiple of 64 instead of 16. The OpenCL kernel only works when local_size(0) is a power of 2. This generates up to 63 elements of padding (the worst case being input_width % 64 == 1), but I don't think that is a big issue and it keeps the border calculation pretty simple. Also increased the tolerance for fp32 because there were mismatches for the 4K image. Change-Id: Id44990a262b2d6eff4c8ce56eb7c886274d9847e Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/143415 Reviewed-by: Pablo Tello <pablo.tello@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Tested-by: Jenkins <bsgcomp@arm.com>
author: Michalis Spyrou <michalis.spyrou@arm.com> 2018-08-08 17:12:38 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:54:54 +0000
commit: 25747e22ed65e15aef3a1a3859381cc5e2085a05 (patch)
tree: 3c7bdda8994189a83267327cd1519f8f8aacc6fc /src/core/CL/kernels/CLReductionOperationKernel.cpp
parent: 2318fcfd0dc8360126bfec71fff88a2015cbc56d (diff)
download: ComputeLibrary-25747e22ed65e15aef3a1a3859381cc5e2085a05.tar.gz
1 files changed, 7 insertions, 3 deletions
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 95967fa974..09861630ac 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -39,6 +39,9 @@ using namespace arm_compute;
 
 namespace
 {
+// OpenCL kernel requires input width to be a power of 2.
+constexpr unsigned int border_val = 64;
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_UNUSED(op);
@@ -69,7 +72,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
     const unsigned int num_elems_processed_per_iteration = 16;
 
     Window             win          = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    const unsigned int border_width = ((input->dimension(0) % num_elems_processed_per_iteration) != 0) ? num_elems_processed_per_iteration - input->dimension(0) % num_elems_processed_per_iteration : 0;
+    const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
 
     AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
     AccessWindowHorizontal output_access(output, 0, 1);
@@ -100,7 +103,8 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
 
     const unsigned int num_elems_processed_per_iteration = 16;
-    const unsigned int border_width                      = ((input->info()->dimension(0) % 16) != 0) ? 16 - input->info()->dimension(0) % 16 : 0;
+    const unsigned int width_leftover                    = input->info()->dimension(0) % border_val;
+    const unsigned int border_width                      = (width_leftover != 0) ? border_val - width_leftover : 0;
     const unsigned int num_of_threads                    = ((input->info()->dimension(0) + border_width) / 16);
 
     _input          = input;
@@ -163,7 +167,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
     Window out_slice = out_window.first_slice_window_2D();
 
     // Reshape window
-    const unsigned int border_width = ((in_slice.x().end() % 16) != 0) ? 16 - in_slice.x().end() % 16 : 0;
+    const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
     in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
 
     // Set local sums buffer
author	Michalis Spyrou <michalis.spyrou@arm.com>	2018-08-08 17:12:38 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:54:54 +0000
commit	25747e22ed65e15aef3a1a3859381cc5e2085a05 (patch)
tree	3c7bdda8994189a83267327cd1519f8f8aacc6fc /src/core/CL/kernels/CLReductionOperationKernel.cpp
parent	2318fcfd0dc8360126bfec71fff88a2015cbc56d (diff)
download	ComputeLibrary-25747e22ed65e15aef3a1a3859381cc5e2085a05.tar.gz