From c210c85548c7f627690ed9259622d3ab342fe612 Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do
Date: Mon, 9 Oct 2023 10:58:35 +0100
Subject: Optimize CL reduction operation

* Batch dimension is added to reduction operation.
  - All the dimensions higher than the batch dimension are collapsed so that
    the input and output tensors are always 3-4D.
  - CL kernel is called once instead of being repeatedly called to process
    each sliding window.

Resolves: COMPMID-6443
Signed-off-by: Viet-Hoa Do
Change-Id: Icd99939d52d3bb648f08537e5f52ef27e894061b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10456
Reviewed-by: Jakub Sujak
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 arm_compute/core/Window.h                          |  14 ++-
 arm_compute/core/Window.inl                        |  20 +++-
 docs/user_guide/release_version_and_change_log.dox |   1 +
 .../CL/cl_kernels/common/reduction_operation.cl    |  92 ++++++++-------
 src/core/CL/kernels/CLReductionOperationKernel.cpp | 124 ++++++++++++++-------
 5 files changed, 156 insertions(+), 95 deletions(-)

diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index 4863b95045..e93d2863c9 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_WINDOW_H
-#define ARM_COMPUTE_WINDOW_H
+#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_H
+#define ACL_ARM_COMPUTE_CORE_WINDOW_H
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
@@ -213,15 +213,17 @@ public:
      */
     void shift(size_t dimension, int shift_value);
 
-    /** Shift down all the dimensions of a window
+    /** Shift down all the dimensions of a window starting from the specified dimension.
      *
-     * i.e new_dims[n] = old_dims[n+shift_value].
+     * new_dims[i] = old_dims[i] for all i < start_dim.
+     * new_dims[i] = old_dims[i+shift_value] for all i >= start_dim.
      *
      * @param[in] shift_value Number of dimensions to shift the window by.
+     * @param[in] start_dim   The dimension from which shifting starts.
      *
      * @return The window with the shifted dimensions.
      */
-    Window shift_dimensions(unsigned int shift_value) const;
+    Window shift_dimensions(unsigned int shift_value, unsigned int start_dim = 0) const;
 
    /** Adjust the start or end of a given dimension by the given value
     *
@@ -460,4 +462,4 @@ private:
 };
 } // namespace arm_compute
 #include "Window.inl"
-#endif /*ARM_COMPUTE_WINDOW_H */
+#endif // ACL_ARM_COMPUTE_CORE_WINDOW_H
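[Editorial sketch, not part of the patch: a standalone illustration of the index mapping described by the new shift_dimensions() documentation above. The plain array stands in for the window's dimension list.]

    #include <array>
    #include <cstdio>

    int main()
    {
        // new_dims[i] = old_dims[i]               for i <  start_dim
        // new_dims[i] = old_dims[i + shift_value] for i >= start_dim
        constexpr std::size_t num_dims = 6;
        const std::array<int, num_dims> old_dims{10, 20, 30, 40, 50, 60};
        std::array<int, num_dims> new_dims{}; // dims shifted in from beyond the end stay 0

        const unsigned int shift_value = 1;
        const unsigned int start_dim   = 1; // e.g. Window::DimY

        for (std::size_t i = 0; i < start_dim; ++i)
            new_dims[i] = old_dims[i];
        for (std::size_t i = start_dim; i < num_dims - shift_value; ++i)
            new_dims[i] = old_dims[i + shift_value];

        for (int d : new_dims)
            std::printf("%d ", d); // prints: 10 30 40 50 60 0
        return 0;
    }

With start_dim = 0 (the default) this reduces to the old behaviour, so existing callers are unaffected.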
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index d935507b1d..0f7c4fbdd7 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, 2022 Arm Limited.
+ * Copyright (c) 2016-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,6 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
+#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_INL
+#define ACL_ARM_COMPUTE_CORE_WINDOW_INL
+
 namespace arm_compute
 {
 inline Window::Window(const Window &src)
@@ -100,13 +104,21 @@ inline Window Window::collapse_if_possible(const Window &full_window,
     return collapsed;
 }
 
-inline Window Window::shift_dimensions(unsigned int shift_value) const
+inline Window Window::shift_dimensions(unsigned int shift_value, unsigned int start_dim) const
 {
     Window shifted_window;
-    for (size_t n = 0; n < (Coordinates::num_max_dimensions - shift_value); n++)
+    size_t n = 0;
+
+    for (; n < start_dim; ++n)
+    {
+        shifted_window.set(n, _dims[n]);
+    }
+
+    for (; n < (Coordinates::num_max_dimensions - shift_value); n++)
     {
         shifted_window.set(n, _dims[n + shift_value]);
     }
+
     return shifted_window;
 }
 
@@ -313,3 +325,5 @@ inline bool operator==(const Window &lhs, const Window &rhs)
     return (lhs._dims == rhs._dims) && (lhs._is_broadcasted == rhs._is_broadcasted);
 }
 } // namespace arm_compute
+
+#endif // ACL_ARM_COMPUTE_CORE_WINDOW_INL
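[Editorial sketch, not part of the change: how the two Window helpers combine. This mirrors the pattern CLReductionOperationKernel::run() adopts later in this patch and assumes the arm_compute headers are available.]

    #include "arm_compute/core/Window.h"

    using namespace arm_compute;

    // For a reduction along Y: fold every dimension from index 2 upwards into
    // one, then drop the reduced Y dimension so the kernel is enqueued over
    // (x, z) only. start_dim keeps dimension 0 (the vectorized X) in place.
    Window make_y_reduction_window(const Window &full)
    {
        bool   has_collapsed = true;
        Window win           = full.collapse_if_possible(full, 2, &has_collapsed);
        win                  = win.shift_dimensions(1, Window::DimY);
        return win;
    }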
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index d1429b61d7..b2500944ca 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -56,6 +56,7 @@ v23.11 Public major release
   - Optimize @ref cpu::CpuReshape
   - Optimize @ref opencl::ClTranspose
   - Optimize @ref NEStackLayer
+  - Optimize @ref CLReductionOperation
 - Add new OpenCL™ kernels:
   - @ref opencl::kernels::ClMatMulLowpNativeMMULKernel support for QASYMM8 and QASYMM8_SIGNED, with batch support
 - Deprecate support for Bfloat16 in @ref cpu::CpuCast.
diff --git a/src/core/CL/cl_kernels/common/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
index 1cb6664078..99369be19a 100644
--- a/src/core/CL/cl_kernels/common/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
@@ -186,27 +186,28 @@ __kernel void reduction_operation_non_parallel_x(
  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
  *
  * @param[in] input_ptr                             Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                         Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in] input_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in] output_ptr                             The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x                       Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y                       Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                       Stride of the output tensor in Z dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 */
 __kernel void reduction_operation_y(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
+    __global uchar *input_ptr,
+    uint input_stride_y,
+    uint input_stride_z,
+    uint input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint output_stride_z,
+    uint output_offset_first_element_in_bytes)
 {
     int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-    int y = get_global_id(1);
+    int z = get_global_id(1);
 
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y;
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * output_stride_z;
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
     res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
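[Editorial note: the x-coordinate clamp shared by these kernels makes the final, partially filled vector overlap the previous one instead of reading past the end of the row. A standalone check of that arithmetic, with illustrative sizes, not patch code:]

    #include <algorithm>
    #include <cstdio>

    // Mirrors: x = max((int)(gid0 * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0)
    // With a row of 20 elements and VEC_SIZE = 16, VEC_SIZE_LEFTOVER = 20 % 16 = 4,
    // so the second workitem starts at x = 4 and its 16-wide load overlaps the
    // first one rather than running out of bounds.
    int main()
    {
        const int VEC_SIZE          = 16;
        const int VEC_SIZE_LEFTOVER = 4;

        for (int gid0 = 0; gid0 < 2; ++gid0)
        {
            const int x = std::max(gid0 * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
            std::printf("workitem %d starts at x = %d\n", gid0, x); // 0, then 4
        }
        return 0;
    }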
@@ -275,32 +276,33 @@ __kernel void reduction_operation_y(
  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
  * @param[in] input_ptr                             Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                         Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w                        Stride of the source tensor in W dimension (in bytes)
 * @param[in] input_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in] output_ptr                             The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x                       Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                        Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z                       Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w                       Stride of the output tensor in W dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 */
 __kernel void reduction_operation_z(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+    __global uchar *input_ptr,
+    uint input_stride_y,
+    uint input_stride_z,
+    uint input_stride_w,
+    uint input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint output_stride_y,
+    uint output_stride_w,
+    uint output_offset_first_element_in_bytes)
 {
     int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
     int y = get_global_id(1);
-    int z = get_global_id(2);
+    int w = get_global_id(2);
 
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z;
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + w * input_stride_w;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + w * output_stride_w;
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
     res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
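[Editorial note: each variant loads the input and immediately CONVERTs it to DATA_TYPE_PROMOTED before accumulating. A standalone illustration of why the wider accumulator matters; the uchar-to-uint mapping below is an assumption for the quantized types, chosen only for the example:]

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Summing 200 values of 200 each: an 8-bit accumulator wraps modulo 256,
        // a 32-bit accumulator holds the exact result.
        std::uint8_t  narrow = 0;
        std::uint32_t wide   = 0;

        for (int i = 0; i < 200; ++i)
        {
            narrow = static_cast<std::uint8_t>(narrow + 200);
            wide += 200;
        }
        std::printf("8-bit: %u, 32-bit: %u\n",
                    static_cast<unsigned>(narrow), static_cast<unsigned>(wide));
        // 8-bit: 64 (40000 % 256), 32-bit: 40000
        return 0;
    }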
@@ -369,39 +371,43 @@ __kernel void reduction_operation_z(
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 *
 * @param[in] input_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                         Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] input_stride_w                         Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v                        Stride of the source tensor in V dimension (in bytes)
 * @param[in] input_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in] output_ptr                             The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x                       Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                        Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_stride_z                        Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w                       Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v                       Stride of the output tensor in V dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 */
 __kernel void reduction_operation_w(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
+    __global uchar *input_ptr,
+    uint input_stride_y,
+    uint input_stride_z,
+    uint input_stride_w,
+    uint input_stride_v,
+    uint input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint output_stride_y,
+    uint output_stride_z,
+    uint output_stride_v,
+    uint output_offset_first_element_in_bytes)
 {
     int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
     int y = get_global_id(1);
-    int z = get_global_id(2);
 
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + (z % DEPTH) * input_stride_z + (z / DEPTH) * input_stride_w;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + (z % DEPTH) * output_stride_z + (z / DEPTH) * output_stride_z;
+    int gid_2 = get_global_id(2);
+    int z     = gid_2 % DEPTH;
+    int v     = gid_2 / DEPTH;
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z + v * input_stride_v;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z + v * output_stride_v;
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
     res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
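[Editorial note: reduction_operation_w folds the depth (z) and batch (v) coordinates into the single third NDRange dimension and recovers them with % and /. Incidentally, the removed line above also fixes a bug: the old output address scaled the batch index by output_stride_z instead of the W stride. A standalone sketch of the decomposition, with made-up DEPTH and BATCH values:]

    #include <cstdio>

    int main()
    {
        const int DEPTH = 3; // compile-time -DDEPTH in the kernel
        const int BATCH = 2; // number of batch (v) iterations

        // One NDRange dimension of size DEPTH * BATCH covers every (z, v) pair.
        for (int gid2 = 0; gid2 < DEPTH * BATCH; ++gid2)
        {
            const int z = gid2 % DEPTH;
            const int v = gid2 / DEPTH;
            std::printf("gid2=%d -> z=%d, v=%d\n", gid2, z, v);
        }
        return 0;
    }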
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 70875a2d40..c8665f8fbd 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -204,9 +204,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
     _kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options());
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(vec_size));
-    win.set(Window::DimX,
-            Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
+    TensorShape actual_input_shape = input->info()->tensor_shape();
+    actual_input_shape[0]          = width;
+
+    Window win = calculate_max_window(actual_input_shape, Steps(vec_size));
     ICLKernel::configure_internal(win);
 
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
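[Editorial note: with the window built over the collapsed shape, the run() hunk below can bind the raw buffer and strides once and launch a single NDRange, instead of sliding 2D/3D/4D slices and enqueueing each one. A back-of-envelope illustration with made-up dimensions, not patch code:]

    #include <cstdio>

    int main()
    {
        // Shape [x, y, z, w] = [64, 128, 8, 4], reduced along y.
        const int dims[4] = {64, 128, 8, 4};

        // Old scheme: one enqueue per 2D slice, i.e. per (z, w) pair.
        const int old_launches = dims[2] * dims[3];

        // New scheme: the dimensions above the reduced axis are collapsed into
        // the window, so one 2D NDRange of (x, z*w) covers the whole tensor.
        const int new_launches = 1;

        std::printf("enqueues: %d -> %d\n", old_launches, new_launches); // 32 -> 1
        return 0;
    }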
@@ -272,55 +273,92 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
             break;
         case 1:
         {
-            // Get first input and output slices
-            Window window_in{window};
-            window_in.set(Window::DimY,
-                          Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
-            Window in_slice  = window_in.first_slice_window_2D();
-            Window out_slice = window.first_slice_window_2D();
-
-            do
-            {
-                unsigned int idx = 0;
-                add_2D_tensor_argument(idx, _input, in_slice);
-                add_2D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice);
-            } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            bool   has_collapsed = true;
+            Window actual_window = window.collapse_if_possible(window, 2, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+            actual_window = actual_window.shift_dimensions(1, Window::DimY);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg(idx++, input_strides[1]);
+            _kernel.setArg(idx++, input_strides[2]);
+            _kernel.setArg(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg(idx++, output_strides[2]);
+            _kernel.setArg(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
         }
         break;
         case 2:
         {
-            // Get first input and output slices
-            Window window_in{window};
-            window_in.set(Window::DimZ,
-                          Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
-            Window in_slice  = window_in.first_slice_window_3D();
-            Window out_slice = window.first_slice_window_3D();
-
-            do
-            {
-                unsigned int idx = 0;
-                add_3D_tensor_argument(idx, _input, in_slice);
-                add_3D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice);
-            } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+            bool   has_collapsed = true;
+            Window actual_window = window.collapse_if_possible(window, 3, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+            actual_window = actual_window.shift_dimensions(1, Window::DimZ);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg(idx++, input_strides[1]);
+            _kernel.setArg(idx++, input_strides[2]);
+            _kernel.setArg(idx++, input_strides[3]);
+            _kernel.setArg(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg(idx++, output_strides[1]);
+            _kernel.setArg(idx++, output_strides[3]);
+            _kernel.setArg(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
         }
         break;
         case 3:
         {
-            // Get first input and output slices
-            Window window_in{window};
-            window_in.set(3, Window::Dimension(0, 1, 1));
-            Window in_slice  = window_in.first_slice_window_4D();
-            Window out_slice = window.first_slice_window_4D();
+            bool   has_collapsed = true;
+            Window actual_window = window.shift_dimensions(1, Window::DimW);
 
-            do
-            {
-                unsigned int idx = 0;
-                add_4D_tensor_argument(idx, _input, in_slice);
-                add_4D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice);
-            } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+            actual_window = actual_window.collapse_if_possible(actual_window, 2, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg(idx++, input_strides[1]);
+            _kernel.setArg(idx++, input_strides[2]);
+            _kernel.setArg(idx++, input_strides[3]);
+            _kernel.setArg(idx++, input_strides[4]);
+            _kernel.setArg(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg(idx++, output_strides[1]);
+            _kernel.setArg(idx++, output_strides[2]);
+            _kernel.setArg(idx++, output_strides[4]);
+            _kernel.setArg(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
        }
        break;
        default:
-- 
cgit v1.2.1