From 36debd4472839997fd3b6ec9d58530d95e3c17de Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 22 Jul 2019 11:42:55 +0100
Subject: COMPMID-1816: Use parallel reduction on 0 axis in CL ARG_MIN/ARG_MAX

Parallelization of the reduction along the x axis
Removal of the use of padding
Fast vector implementation of the reduction operation

Change-Id: I3a56c57b9fc1135cf8f79d1021d966ea22b084b1
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/1791
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Gian Marco Iodice
---
 src/core/CL/cl_kernels/reduction_operation.cl      | 168 +++++++++++++++++++--
 src/core/CL/kernels/CLReductionOperationKernel.cpp |  81 ++++++----
 tests/benchmark/CL/ArgMinMax.cpp                   |  56 +++++++
 tests/benchmark/fixtures/ArgMinMaxFixture.h        |  84 +++++++++++
 4 files changed, 347 insertions(+), 42 deletions(-)
 create mode 100644 tests/benchmark/CL/ArgMinMax.cpp
 create mode 100644 tests/benchmark/fixtures/ArgMinMaxFixture.h

diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index 5a4bb9ff4c..db034f0c3a 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -36,6 +36,10 @@
 #endif // defined(WIDTH)
 #endif // FLOAT_DATA_TYPE
 
+#if defined(DATA_TYPE)
+
+#if defined(OPERATION) && defined(WIDTH)
+
 /** Calculate square sum of a vector
  *
  * @param[in] input Pointer to the first pixel.
@@ -91,10 +95,112 @@ inline DATA_TYPE product(__global const DATA_TYPE *input)
     return (in.s0 * in.s1);
 }
 
-#if defined(OPERATION)
+
+#if defined(DATA_TYPE_OUTPUT)
+
+#if defined(ARG_MAX)
+/** Find the index of the maximum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return Index of the maximum value.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
+{
+#if defined(MULTI_ACCESS_X)
+
+    int       x_elem   = x_idx * 16;
+    const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+    x_elem -= x_goback;
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input - x_goback);
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+    VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+    idx_sel       = (in.s01234567 > in.s89abcdef) || (in.s01234567 == in.s89abcdef && CONVERT((res.s01234567 < res.s89abcdef), VEC_DATA_TYPE(COND_DATA_TYPE, 8)));
+    in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
+    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+    idx_sel.s0123 = (in.s0123 > in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(COND_DATA_TYPE, 4)));
+    in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+    idx_sel.s01 = (in.s01 > in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
+    in.s01      = select(in.s23, in.s01, idx_sel.s01);
+    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+    idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+    res.s0     = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+    return res.s0 + x_elem;
+#else // defined(MULTI_ACCESS_X)
+
+    DATA_TYPE_OUTPUT res = 0;
+    for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+    {
+        res = select(res, x_v, *(input + x_v) > *(input + res));
+    }
+
+    return res;
+#endif // defined(MULTI_ACCESS_X)
+}
+#endif // defined(ARG_MAX)
+
+#if defined(ARG_MIN)
+/** Find the index of the minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return Index of the minimum value.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
+{
+#if defined(MULTI_ACCESS_X)
+
+    int       x_elem   = x_idx * 16;
+    const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+    x_elem -= x_goback;
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input - x_goback);
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+    VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+    idx_sel       = (in.s01234567 < in.s89abcdef) || (in.s01234567 == in.s89abcdef && CONVERT((res.s01234567 < res.s89abcdef), VEC_DATA_TYPE(COND_DATA_TYPE, 8)));
+    in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
+    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+    idx_sel.s0123 = (in.s0123 < in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(COND_DATA_TYPE, 4)));
+    in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+    idx_sel.s01 = (in.s01 < in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
+    in.s01      = select(in.s23, in.s01, idx_sel.s01);
+    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+    idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+    res.s0     = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+    return res.s0 + x_elem;
+#else // defined(MULTI_ACCESS_X)
+
+    DATA_TYPE_OUTPUT res = 0;
+    for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+    {
+        res = select(res, x_v, *(input + x_v) < *(input + res));
+    }
+
+    return res;
+#endif // defined(MULTI_ACCESS_X)
+}
+#endif // defined(ARG_MIN)
+
 /** This kernel performs parallel reduction given an operation on x-axis.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
  * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
  * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
  * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
@@ -117,7 +223,7 @@ inline DATA_TYPE product(__global const DATA_TYPE *input)
 __kernel void reduction_operation_x(
     IMAGE_DECLARATION(src),
     IMAGE_DECLARATION(partial_res),
-    __local DATA_TYPE *local_results)
+    __local DATA_TYPE_OUTPUT *local_results)
 {
     Image src         = CONVERT_TO_IMAGE_STRUCT(src);
     Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
@@ -125,9 +231,17 @@ __kernel void reduction_operation_x(
     unsigned int lsize = get_local_size(0);
     unsigned int lid   = get_local_id(0);
 
+    const uint x_idx = get_global_id(0);
+    const uint y_idx = get_global_id(1);
+
     for(unsigned int y = 0; y < get_local_size(1); ++y)
     {
-        local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
+#if defined(ARG_MAX) || defined(ARG_MIN)
+        local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#else // defined(ARG_MAX) || defined(ARG_MIN)
+        local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
         barrier(CLK_LOCAL_MEM_FENCE);
 
         // Perform parallel reduction
@@ -137,9 +251,26 @@ __kernel void reduction_operation_x(
             {
 #if defined(PROD)
                 local_results[lid] *= local_results[lid + i];
-#else // !defined(PROD)
+#elif defined(ARG_MAX)
+                __global DATA_TYPE *src_in_row = src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y;
+                DATA_TYPE tmp0                 = *(src_in_row + local_results[lid]);
+                DATA_TYPE tmp1                 = *(src_in_row + local_results[lid + i]);
+                local_results[lid]             = select(
+                                                     local_results[lid],
+                                                     local_results[lid + i],
+                                                     ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1));
+
+#elif defined(ARG_MIN)
+                __global DATA_TYPE *src_in_row = src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y;
+                DATA_TYPE tmp0                 = *(src_in_row + local_results[lid]);
+                DATA_TYPE tmp1                 = *(src_in_row + local_results[lid + i]);
+                local_results[lid]             = select(
+                                                     local_results[lid],
+                                                     local_results[lid + i],
+                                                     ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
+#else // !defined(PROD) && !defined(ARG_MAX) && !defined(ARG_MIN)
                 local_results[lid] += local_results[lid + i];
-#endif // defined(PROD)
+#endif // !defined(PROD) && !defined(ARG_MAX) && !defined(ARG_MIN)
             }
             barrier(CLK_LOCAL_MEM_FENCE);
         }
@@ -152,16 +283,22 @@ __kernel void reduction_operation_x(
                 local_results[0] /= WIDTH;
             }
 #endif // defined(MEAN) && defined(WIDTH)
-            ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
+            ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
         }
     }
 }
-#endif // defined(OPERATION)
+
+#endif // defined(DATA_TYPE_OUTPUT)
+
+#endif // defined(OPERATION) && defined(WIDTH)
+
+#if defined(DATA_TYPE_PROMOTED)
 #if defined(WIDTH)
 /** This kernel performs reduction on x-axis. (Non parallel)
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the intermediate results must be passed at compile time using -DDATA_TYPE_PROMOTED: e.g. -DDATA_TYPE_PROMOTED=uint
  * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
  * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
  * @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
@@ -191,13 +328,7 @@ __kernel void reduction_operation_non_parallel_x(
     for(unsigned int x = 1; x < WIDTH; ++x)
     {
         DATA_TYPE_PROMOTED in = *((__global DATA_TYPE *)vector_offset(&src, x));
-#if defined(ARG_MAX)
-        indx = select(indx, x, ISGREATER(in, res));
-        res  = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
-#elif defined(ARG_MIN)
-        indx = select(indx, x, ISLESS(in, res));
-        res  = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
-#elif defined(MIN)
+#if defined(MIN)
         res = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
 #elif defined(MAX)
         res = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
@@ -226,6 +357,7 @@ __kernel void reduction_operation_non_parallel_x(
 /** This kernel performs reduction on y-axis.
  *
  * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the intermediate results must be passed at compile time using -DDATA_TYPE_PROMOTED: e.g. -DDATA_TYPE_PROMOTED=uint
  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
  *
  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
@@ -303,6 +435,7 @@ __kernel void reduction_operation_y(
 /** This kernel performs reduction on z-axis.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the intermediate results must be passed at compile time using -DDATA_TYPE_PROMOTED: e.g. -DDATA_TYPE_PROMOTED=uint
  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
  * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
@@ -400,8 +533,9 @@ __kernel void reduction_operation_z(
 /** This kernel performs reduction on w-axis.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the intermediate results must be passed at compile time using -DDATA_TYPE_PROMOTED: e.g. -DDATA_TYPE_PROMOTED=uint
  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
  * @param[in] input_ptr      Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
  * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -482,3 +616,7 @@ __kernel void reduction_operation_w(
 #endif // defined(ARG_MAX) || defined(ARG_MIN)
 }
 #endif /* defined(BATCH) && defined(DEPTH) */
+
+#endif /* defined(DATA_TYPE_PROMOTED) */
+
+#endif /* defined(DATA_TYPE) */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 8e92b591d1..b26d1eeb91 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -40,8 +40,9 @@ namespace arm_compute
 {
 namespace
 {
-// OpenCL kernel requires input width to be a power of 2 for x-axis.
-constexpr unsigned int border_val = 64;
+// OpenCL kernel requires input width to be a multiple of 16 for the x-axis in order to use vector operations.
+// It also needs to be a power of two for the pairwise parallel reduction.
+constexpr unsigned int border_val = 16;
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
@@ -89,8 +90,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
     const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
     Window             win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
     bool               window_changed = false;
-    const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN
-                               || op == ReductionOperation::MAX || is_data_type_quantized(input->data_type()));
+    const bool is_serial_op = (op == ReductionOperation::MIN || op == ReductionOperation::MAX || is_data_type_quantized(input->data_type()));
 
     switch(axis)
     {
@@ -105,7 +105,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
             }
             else
             {
-                const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
+                const unsigned int border_width = ((input->dimension(0) % border_val) != 0 && !is_arg_min_max) ? border_val - input->dimension(0) % border_val : 0;
                 AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
                 AccessWindowHorizontal output_access(output, 0, 1);
                 window_changed = update_window_and_padding(win, input_access, output_access);
@@ -148,6 +148,8 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op, width));
 
+    auto win_config = validate_and_configure_window(input->info(), output->info(), axis, op);
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
     _input  = input;
     _output = output;
@@ -184,7 +186,11 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
             build_opts.add_option(("-DOPERATION=sum"));
             break;
         case ReductionOperation::ARG_IDX_MAX:
+            build_opts.add_option(("-DOPERATION=arg_idx_max"));
+            break;
         case ReductionOperation::ARG_IDX_MIN:
+            build_opts.add_option(("-DOPERATION=arg_idx_min"));
+            break;
        case ReductionOperation::MIN:
        case ReductionOperation::MAX:
            break;
@@ -198,30 +204,56 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     // Create kernel
     cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
     std::string kernel_axis_name;
-    const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX
+    const bool is_serial_op = (op == ReductionOperation::MIN || op == ReductionOperation::MAX
                                || is_data_type_quantized(input->info()->data_type()));
+
+    const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
 
     switch(axis)
     {
         case 0:
         {
+            build_opts.add_option("-DDATA_TYPE_OUTPUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+            build_opts.add_option("-DCOND_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
             if(is_serial_op)
            {
                 build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-                build_opts.add_option_if_else(_input->info()->data_type() == DataType::F16, "-DCOND_DATA_TYPE=short", "-DCOND_DATA_TYPE=int");
                 kernel_axis_name = "non_parallel_x";
             }
             else
             {
-                build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
-                const unsigned int width_leftover = input->info()->dimension(0) % border_val;
-                const unsigned int border_width   = (width_leftover != 0) ? border_val - width_leftover : 0;
-                const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
-                kernel_axis_name                  = "x";
-
-                // Set the number of WG based on the input size. If input width is < 128
-                // we can use fewer threads than 8.
-                lws_hint     = cl::NDRange(std::min(8U, num_of_threads));
-                _border_size = BorderSize(0, border_width, 0, 0);
+                if(op == ReductionOperation::MEAN_SUM)
+                {
+                    build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(width));
+                }
+                else
+                {
+                    build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+                }
+                kernel_axis_name = "x";
+                if(is_arg_min_max)
+                {
+                    const bool multi_access_x = (_input->info()->tensor_shape().x() > 16);
+                    build_opts.add_option_if(multi_access_x, "-DMULTI_ACCESS_X");
+
+                    const unsigned int width_leftover = input->info()->dimension(0) % 16;
+                    const unsigned int border_width   = (width_leftover != 0) ? 16 - width_leftover : 0;
+                    const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
+
+                    // Set the number of WG based on the input size. If input width is < 128
+                    // we can use fewer than 8 threads per workgroup
+                    lws_hint     = cl::NDRange(std::min(8U, num_of_threads));
+                    _border_size = BorderSize(0, 0, 0, 0);
+                }
+                else
+                {
+                    const unsigned int width_leftover = input->info()->dimension(0) % border_val;
+                    const unsigned int border_width   = (width_leftover != 0) ? border_val - width_leftover : 0;
+                    const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
+                    // Set the number of WG based on the input size. If input width is < 128
+                    // we can use fewer than 8 threads per workgroup
+                    lws_hint     = cl::NDRange(std::min(8U, num_of_threads));
+                    _border_size = BorderSize(0, border_width, 0, 0);
+                }
             }
         }
         break;
@@ -244,10 +276,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation_" + kernel_axis_name, build_opts.options()));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
     ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
 }
 
@@ -263,9 +291,8 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    const bool is_serial_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN || _op == ReductionOperation::MIN || _op == ReductionOperation::MAX
-                               || is_data_type_quantized(_input->info()->data_type()));
+    const bool is_arg_min_max = (_op == ReductionOperation::ARG_IDX_MIN || _op == ReductionOperation::ARG_IDX_MAX);
+    const bool is_serial_op   = (_op == ReductionOperation::MIN || _op == ReductionOperation::MAX || is_data_type_quantized(_input->info()->data_type()));
     switch(_reduction_axis)
     {
         case 0:
@@ -300,11 +327,11 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
             Window out_slice = out_window.first_slice_window_2D();
 
             // Reshape window
-            const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
+            const unsigned int border_width = ((in_slice.x().end() % border_val) != 0 && !is_arg_min_max) ? border_val - in_slice.x().end() % border_val : 0;
             in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
 
             // Set local sums buffer
-            unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
+            unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
             _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
 
             do
@@ -376,4 +403,4 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
             ARM_COMPUTE_ERROR("Not supported");
     }
 }
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/benchmark/CL/ArgMinMax.cpp b/tests/benchmark/CL/ArgMinMax.cpp
new file mode 100644
index 0000000000..25a4a05d44
--- /dev/null
+++ b/tests/benchmark/CL/ArgMinMax.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/benchmark/fixtures/ArgMinMaxFixture.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SplitDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+TEST_SUITE(CL)
+
+using CLArgMinMaxBenchmarkFixture = ArgMinMaxBenchmarkFixture<CLTensor, CLArgMinMaxLayer>;
+
+REGISTER_FIXTURE_DATA_TEST_CASE(ArgMinMax, CLArgMinMaxBenchmarkFixture, framework::DatasetMode::PRECOMMIT,
+                                framework::dataset::combine(framework::dataset::combine(framework::dataset::combine(
+                                                                                            datasets::Large3DShapes(),
+                                                                                            framework::dataset::make("DataType", { DataType::F32 })),
+                                                                                        framework::dataset::make("Axis", { 0, 1 })),
+                                                            framework::dataset::make("ReductionOperation", { ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MIN })));
+
+TEST_SUITE_END() // CL
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/fixtures/ArgMinMaxFixture.h b/tests/benchmark/fixtures/ArgMinMaxFixture.h
new file mode 100644
index 0000000000..f1a0c5ab4b
--- /dev/null
+++ b/tests/benchmark/fixtures/ArgMinMaxFixture.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_ARGMINMAXFIXTURE
+#define ARM_COMPUTE_TEST_ARGMINMAXFIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/Globals.h"
+#include "tests/Utils.h"
+#include "tests/framework/Fixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Fixture that can be used for NEON and CL */
+template <typename TensorType, typename Function>
+class ArgMinMaxBenchmarkFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, DataType data_type, int axis, ReductionOperation op)
+    {
+        // Create tensors
+        src = create_tensor<TensorType>(shape, data_type);
+        dst = create_tensor<TensorType>(shape, DataType::U32);
+
+        // Create and configure function
+        argminmax_layer.configure(&src, axis, &dst, op);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+    }
+
+    void run()
+    {
+        argminmax_layer.run();
+    }
+
+    void sync()
+    {
+        sync_if_necessary<TensorType>();
+        sync_tensor_if_necessary<TensorType>(dst);
+    }
+
+    void teardown()
+    {
+        src.allocator()->free();
+        dst.allocator()->free();
+    }
+
+private:
+    TensorType src{};
+    TensorType dst{};
+    Function   argminmax_layer{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_ARGMINMAXFIXTURE */
-- 
cgit v1.2.1
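A note on the parallel x-axis kernel above: reduction_operation_x keeps one candidate index per work-item in local_results[] and halves the number of active lanes at each barrier, preferring the larger (for ARG_MAX) or smaller (for ARG_MIN) value and breaking ties toward the lower index, so the result keeps the serial path's first-occurrence semantics. The following minimal host-side C++ sketch models just that tie-aware pairwise ARG_MAX step; it is illustrative only (not code from the patch), and it assumes a power-of-two row length, mirroring the power-of-two lane count the kernel's halving loop relies on.

// Host-side model of the tie-aware pairwise reduction used by
// reduction_operation_x for ARG_MAX. Assumes row.size() is a power of two.
#include <cstddef>
#include <iostream>
#include <vector>

static std::size_t arg_max_pairwise(const std::vector<float> &row)
{
    // Each "lane" starts by nominating its own element, like local_results[lid].
    std::vector<std::size_t> best(row.size());
    for(std::size_t i = 0; i < best.size(); ++i)
    {
        best[i] = i;
    }
    // Halve the active lanes each step, like the kernel's barrier-separated loop.
    for(std::size_t stride = row.size() / 2; stride > 0; stride /= 2)
    {
        for(std::size_t lane = 0; lane < stride; ++lane)
        {
            const float v0 = row[best[lane]];
            const float v1 = row[best[lane + stride]];
            // Same condition as the kernel's select(): take the other lane's
            // candidate if its value is greater, or equal with a smaller index.
            if((v1 > v0) || (v1 == v0 && best[lane + stride] < best[lane]))
            {
                best[lane] = best[lane + stride];
            }
        }
    }
    return best[0]; // lane 0 ends up with the index of the first maximum
}

int main()
{
    const std::vector<float> row = { 3.f, 7.f, 7.f, 1.f, 0.f, 7.f, 2.f, 5.f };
    std::cout << arg_max_pairwise(row) << '\n'; // prints 1: first occurrence of 7
}

The ARG_MIN variant only flips the value comparison; the index tie-break stays "smaller index wins" in both, which is why the parallel result matches the serial loop exactly.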
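The MULTI_ACCESS_X "go-back" offset in arg_idx_max/arg_idx_min is also worth isolating. When WIDTH is not a multiple of 16, the work-item owning the last 16-wide block shifts its vload16 window left so it never reads past the end of the row, then adds the shifted base back when converting the winning lane into a row index. A standalone sketch of just that index arithmetic follows; the function name and the width of 40 are invented for illustration.

// Model of the MULTI_ACCESS_X "go-back" arithmetic from the kernel's
// select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH) expression.
#include <cstdio>

static int block_base(int x_idx, int width)
{
    int       x_elem   = x_idx * 16;
    const int x_goback = (x_elem + 16 > width) ? (16 - width % 16) : 0;
    return x_elem - x_goback; // first element covered by this block's vload16
}

int main()
{
    // width = 40: the third block starts at 24 instead of 32, overlapping the
    // second block by 8 elements rather than reading out of bounds.
    for(int x_idx = 0; x_idx < 3; ++x_idx)
    {
        std::printf("block %d loads [%d, %d)\n", x_idx, block_base(x_idx, 40), block_base(x_idx, 40) + 16);
    }
}

Re-examining overlapped elements is harmless for arg-min/arg-max, since seeing a value twice cannot change which index wins, and that appears to be what lets the commit drop the padding requirement for these operations.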