From 7b9998d0fe1f98768b690ead10ebfa166d1b873d Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 21 Oct 2019 17:59:07 +0100
Subject: COMPMID-1816: Use parallel reduction on 0 axis in CL ARG_MIN/ARG_MAX

Introducing new CLArgMinMax kernel

Change-Id: I0b8254207cc3859d19ceef9b6429cf5c1c586db0
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/2202
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michalis Spyrou
---
 src/core/CL/kernels/CLReductionOperationKernel.cpp | 27 +++++-----------------
 1 file changed, 6 insertions(+), 21 deletions(-)

(limited to 'src/core/CL/kernels/CLReductionOperationKernel.cpp')

diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index cbf3923243..91ee83e530 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -60,19 +60,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
     ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN, "Not supported reduction operation, use CLArgMinMaxLayer");

     if(output->total_size() != 0)
     {
-        if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, "Not supported operation for QASYMM8");
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }

     return Status{};
@@ -81,9 +74,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
 std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     // Output tensor auto initialization if not yet initialized
-    const bool        is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
-    const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, !is_arg_min_max);
-    const DataType    output_data_type = is_arg_min_max ? DataType::S32 : input->data_type();
+    const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, true);
+    DataType          output_data_type = input->data_type();
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));

     const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
@@ -166,8 +158,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
     build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
     build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
-    build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
-    build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
     build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
     build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
     build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
@@ -182,8 +172,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
             case ReductionOperation::MEAN_SUM:
                 build_opts.add_option(("-DOPERATION=sum"));
                 break;
-            case ReductionOperation::ARG_IDX_MAX:
-            case ReductionOperation::ARG_IDX_MIN:
             case ReductionOperation::MIN:
             case ReductionOperation::MAX:
                 break;
@@ -214,12 +202,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
             build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
            const unsigned int width_leftover = input->info()->dimension(0) % border_val;
            const unsigned int border_width   = (width_leftover != 0) ? border_val - width_leftover : 0;
-           const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
            kernel_axis_name                  = "x";

-           // Set the number of WG based on the input size. If input width is < 128
-           // we can use fewer threads than 8.
-           lws_hint     = cl::NDRange(std::min(8U, num_of_threads));
+           lws_hint     = create_lws_hint_parallel_implementations(input->info()->dimension(0), border_val);
            _border_size = BorderSize(0, border_width, 0, 0);
        }
    }
--
cgit v1.2.1
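
Note on the new helper: create_lws_hint_parallel_implementations() is only referenced by this diff; its definition is not part of the file shown above. The sketch below is an assumption reconstructed from the inline logic removed in the same hunk (padded width divided by 16, capped at 8); the parameter names and includes are illustrative, not the committed implementation.

#include "arm_compute/core/CL/OpenCL.h" // ACL's OpenCL C++ wrapper, provides cl::NDRange
#include <algorithm>

// Sketch (assumption): derive a local-workgroup-size hint from the input width,
// mirroring the code this commit removes from CLReductionOperationKernel::configure().
cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int border_val)
{
    // Pad the width up to a multiple of border_val, as the removed code did via border_width.
    const unsigned int width_leftover = input_dimension % border_val;
    const unsigned int padded_width   = input_dimension + ((width_leftover != 0) ? border_val - width_leftover : 0);

    // One work-item per 16 elements, capped at 8, so inputs narrower than 128 elements use fewer threads.
    const unsigned int num_of_threads = padded_width / 16;
    return cl::NDRange(std::min(8U, num_of_threads));
}

For example, with input->info()->dimension(0) = 100 and border_val = 16, the padded width is 112 and the hint is cl::NDRange(7), consistent with the removed comment about using fewer than 8 threads for widths below 128.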