From f6402dd37092c842d1de9998b23640caf12f227b Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Fri, 26 Jan 2018 15:06:19 +0000
Subject: COMPMID-834 Fix arm_compute_nightly_validation getting killed

Changed CLReductionOperationKernel: Now each kernel computes
a 2D slice instead of 1D. This reduces the memory footprint
from around 1.6Gb for a 4k input image to a few Mb, which was
caused by the __local memory and was probably the cause for this bug.

Change-Id: I71ac71ff09b041c945a134177600f0f3475e48cf
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117835
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Anthony Barbier <anthony.barbier@arm.com>
---
 src/core/CL/cl_kernels/reduction_operation.cl      | 45 +++++++++++++---------
 src/core/CL/kernels/CLReductionOperationKernel.cpp | 14 +++----
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index d46a22600f..aa7403b52b 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,39 +70,46 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input)
  * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F32
  * @param[in] src_stride_x                              Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the source tensor
  * @param[in] partial_sum_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
- * @param[in] partial_sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in] partial_sum_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] partial_sum_step_x                        partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_sum_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] partial_sum_step_y                        partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] local_sums                                Local buffer for storing the partioal sum
+ * @param[in] local_sums                                Local buffer for storing the partial sum
  */
 __kernel void reduction_operation(
-    VECTOR_DECLARATION(src),
-    VECTOR_DECLARATION(partial_sum),
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(partial_sum),
     __local DATA_TYPE *local_sums)
 {
-    Vector src         = CONVERT_TO_VECTOR_STRUCT(src);
-    Vector partial_sum = CONVERT_TO_VECTOR_STRUCT(partial_sum);
+    Image src         = CONVERT_TO_IMAGE_STRUCT(src);
+    Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
 
     unsigned int lsize = get_local_size(0);
     unsigned int lid   = get_local_id(0);
 
-    local_sums[lid] = OPERATION((__global DATA_TYPE *)src.ptr);
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Perform parallel reduction
-    for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
+    for(unsigned int y = 0; y < get_local_size(1); ++y)
     {
-        if(lid < i)
+        local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Perform parallel reduction
+        for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
         {
-            local_sums[lid] += local_sums[lid + i];
+            if(lid < i)
+            {
+                local_sums[lid] += local_sums[lid + i];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
 
-    if(lid == 0)
-    {
-        ((__global DATA_TYPE *)partial_sum.ptr + get_group_id(0))[0] = local_sums[0];
+        if(lid == 0)
+        {
+            ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
+        }
     }
 }
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 18a8e353d7..1dd5eb97ec 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -118,8 +118,8 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
     out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
 
     // Get first input and output slices
-    Window in_slice  = window.first_slice_window_1D();
-    Window out_slice = out_window.first_slice_window_1D();
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = out_window.first_slice_window_2D();
 
     // Reshape window
     const unsigned int border_width = ((in_slice.x().end() % 128) != 0) ? 128 - in_slice.x().end() % 128 : 0;
@@ -127,14 +127,14 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
 
     // Set local sums buffer
     unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
-    _kernel.setArg(num_arguments_per_1D_tensor() * 2, local_sum_size, nullptr);
+    _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
 
     do
     {
         unsigned int idx = 0;
-        add_1D_tensor_argument(idx, _input, in_slice);
-        add_1D_tensor_argument(idx, _output, out_slice);
+        add_2D_tensor_argument(idx, _input, in_slice);
+        add_2D_tensor_argument(idx, _output, out_slice);
         enqueue(queue, *this, in_slice, _lws_hint);
     }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
 }
-- 
cgit v1.2.1