From 4726fdf9f7bb9638ffbb4e358cc788686b68a414 Mon Sep 17 00:00:00 2001
From: Moritz Pflanzer <moritz.pflanzer@arm.com>
Date: Sat, 23 Sep 2017 10:47:54 +0100
Subject: COMPMID-541: Fix padding in CLMinMaxLocationKernel

Change-Id: Ie17e3f14c428553d433da2a564e016bfac7749a9
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/88881
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
---
 src/core/CL/cl_kernels/minmaxloc.cl            | 20 ++++++++++----------
 src/core/CL/kernels/CLMinMaxLocationKernel.cpp | 19 ++++++++++++-------
 src/runtime/CL/functions/CLMinMaxLocation.cpp  |  5 +++--
 3 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'src')
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
index 05fc78dcb5..0f557a499d 100644
--- a/src/core/CL/cl_kernels/minmaxloc.cl
+++ b/src/core/CL/cl_kernels/minmaxloc.cl
@@ -45,7 +45,7 @@ inline int FloatFlip(float val)
 
 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
-__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 
 /** This function identifies the min and maximum value of an input image.
  *
@@ -65,7 +65,7 @@ __constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
 __kernel void minmax(
     IMAGE_DECLARATION(src),
     __global int *min_max,
-    uint          width)
+    int           width)
 {
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 
@@ -76,11 +76,11 @@ __kernel void minmax(
     local_max = type_min;
 
     // Calculate min/max of row
-    uint width4 = width >> 4;
-    for(uint i = 0; i < width4; i++)
+    int i = 0;
+    for(; i + 16 <= width; i += 16)
     {
         VEC_DATA_TYPE(DATA_TYPE, 16)
-        data      = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+        data      = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
         local_min = min(data, local_min);
         local_max = max(data, local_max);
     }
@@ -88,15 +88,15 @@ __kernel void minmax(
 #ifdef NON_MULTIPLE_OF_16
     // Handle non multiple of 16
     VEC_DATA_TYPE(DATA_TYPE, 16)
-    data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+    data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
 #ifdef IS_DATA_TYPE_FLOAT
-    int16 widx = convert_int16(((uint16)(width4 << 4) + idx16) < width);
+    int16 valid_indices = (i + idx16) < width;
 #else  /* IS_DATA_TYPE_FLOAT */
     VEC_DATA_TYPE(DATA_TYPE, 16)
-    widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
+    valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
 #endif /* IS_DATA_TYPE_FLOAT */
-    local_max = max(local_max, select(type_min, data, widx));
-    local_min = min(local_min, select(type_max, data, widx));
+    local_max = max(local_max, select(type_min, data, valid_indices));
+    local_min = min(local_min, select(type_max, data, valid_indices));
 #endif /* NON_MULTIPLE_OF_16 */
 
     // Perform min/max reduction
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
index be0034382c..5636592347 100644
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -32,8 +32,8 @@
 
 #include <climits>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 inline int32_t FloatFlip(float val)
 {
     static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t");
@@ -88,9 +88,13 @@ void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
     }
 
     // Set kernel build options
-    std::set<std::string> build_opts;
-    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.emplace((0 != (num_elems_processed_per_iteration % max_cl_vector_width)) ? "-DNON_MULTIPLE_OF_16" : "");
+    std::set<std::string> build_opts{ "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
+
+    if(num_elems_processed_per_iteration % max_cl_vector_width != 0)
+    {
+        build_opts.emplace("-DNON_MULTIPLE_OF_16");
+    }
+
     if(input->info()->data_type() == DataType::F32)
     {
         build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(std::numeric_limits<float>::max()));
@@ -109,11 +113,11 @@ void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
     // Set fixed arguments
     unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
     _kernel.setArg(idx++, *_min_max);
-    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(input->info()->dimension(0)));
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, ceil_to_multiple(num_elems_processed_per_iteration, 16)));
     ICLKernel::configure(win);
 }
 
@@ -226,3 +230,4 @@ void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue)
     }
     while(window.slide_window_slice_2D(slice));
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
index bc70ceb621..49dcbcb7df 100644
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -25,8 +25,8 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLMinMaxLocation::CLMinMaxLocation()
     : _min_max_kernel(),
       _min_max_loc_kernel(),
@@ -96,3 +96,4 @@ void CLMinMaxLocation::run()
         _max_loc->resize(max_corner_size);
     }
 }
+} // namespace arm_compute
-- 
cgit v1.2.1