From 1fac03717dab014fd202ea85a8f05b3dd475cb3c Mon Sep 17 00:00:00 2001
From: Giorgio Arena <giorgio.arena@arm.com>
Date: Fri, 30 Apr 2021 15:09:46 +0100
Subject: Fix bug on CLReductionOperation

Execution window along the X axis needs to be collapsed on the 3rd axis (rather than the 2nd) since there could be implicit padding added along the Y

Resolve COMPMID-4425

Change-Id: I9623a31749b737fea7c623cabdcfbf77cbe8f6dc
Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5551
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: TeresaARM <teresa.charlinreyes@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/CL/cl_kernels/reduction_operation.cl      | 22 +++++++++++++---------
 src/core/CL/kernels/CLReductionOperationKernel.cpp | 14 ++++----------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index 912b6c91a9..9f2c6e23b5 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -25,12 +25,16 @@
 #include "helpers_asymm.h"
 
 #if defined(FLOAT_DATA_TYPE)
-#define ISGREATER(x, y) isgreater(x, y)
-#define ISLESS(x, y) isless(x, y)
+#define ISGREATER(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isgreater(x, y))
+#define ISLESS(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isless(x, y))
+#define ISGREATER_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isgreater(x, y))
+#define ISLESS_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isless(x, y))
 #else // !FLOAT_DATA_TYPE
 #if defined(WIDTH)
 #define ISGREATER(x, y) (x > y) ? 1 : 0
 #define ISLESS(x, y) (x < y) ? 1 : 0
+#define ISGREATER_SCALAR ISGREATER
+#define ISLESS_SCALAR ISLESS
 #else // !defined(WIDTH)
 #define ISGREATER(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x > y)
 #define ISLESS(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x < y)
@@ -66,13 +70,14 @@
  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void reduction_operation_x(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
 {
     int y = get_global_id(1);
+    int z = get_global_id(2);
 
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y;
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;
 
 #if defined(PROD)
     DATA_TYPE res = (DATA_TYPE)1;
@@ -108,7 +113,6 @@ __kernel void reduction_operation_x(
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
  * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
- * @note In case of MIN and MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
  *
  * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
@@ -137,9 +141,9 @@ __kernel void reduction_operation_non_parallel_x(
     {
         DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, x)), DATA_TYPE_PROMOTED);
 #if defined(MIN)
-        res = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
+        res = select(res, in, ISLESS_SCALAR(in, res));
 #elif defined(MAX)
-        res = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
+        res = select(res, in, ISGREATER_SCALAR(in, res));
 #elif defined(PROD)
 #if defined(OFFSET) && defined(SCALE)
         res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 5c80f33802..133a35f513 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -159,13 +159,7 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
         case 0:
         {
             build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(width));
-            kernel_axis_name = "x";
-
-            if(is_serial_op)
-            {
-                build_opts.add_option_if_else(_input->info()->data_type() == DataType::F16, "-DCOND_DATA_TYPE=short", "-DCOND_DATA_TYPE=int");
-                kernel_axis_name = "non_parallel_x";
-            }
+            kernel_axis_name = ((is_serial_op) ? "non_parallel_x" : "x");
         }
         break;
         case 1:
@@ -236,15 +230,15 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
             {
                 // Set out window
                 bool   has_collapsed = true;
-                Window window_in     = window.collapse_if_possible(window, 1, &has_collapsed);
+                Window window_in     = window.collapse_if_possible(window, 2, &has_collapsed);
                 ARM_COMPUTE_ERROR_ON(!has_collapsed);
 
                 Window window_out = window_in;
                 window_out.set(0, Window::Dimension());
 
                 unsigned int idx = 0;
-                add_2D_tensor_argument(idx, _input, window_in);
-                add_2D_tensor_argument(idx, _output, window_out);
+                add_3D_tensor_argument(idx, _input, window_in);
+                add_3D_tensor_argument(idx, _output, window_out);
                 enqueue(queue, *this, window_in);
             }
         }
-- 
cgit v1.2.1