From 7b9998d0fe1f98768b690ead10ebfa166d1b873d Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 21 Oct 2019 17:59:07 +0100
Subject: COMPMID-1816: Use parallel reduction on 0 axis in CL ARG_MIN/ARG_MAX

Introducing new CLArgMinMax kernel

Change-Id: I0b8254207cc3859d19ceef9b6429cf5c1c586db0
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/2202
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michalis Spyrou
---
 src/core/CL/kernels/CLReductionOperationKernel.cpp | 27 +++++-----------------
 1 file changed, 6 insertions(+), 21 deletions(-)

(limited to 'src/core/CL/kernels/CLReductionOperationKernel.cpp')

diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index cbf3923243..91ee83e530 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -60,19 +60,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
     ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN, "Not supported reduction operation, use CLArgMinMaxLayer");

     if(output->total_size() != 0)
     {
-        if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, "Not supported operation for QASYMM8");
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }

     return Status{};
@@ -81,9 +74,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
 std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     // Output tensor auto initialization if not yet initialized
-    const bool        is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
-    const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, !is_arg_min_max);
-    const DataType    output_data_type = is_arg_min_max ? DataType::S32 : input->data_type();
+    const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, true);
+    DataType          output_data_type = input->data_type();
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));

     const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
@@ -166,8 +158,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
     build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
     build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
-    build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
-    build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
     build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
     build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
     build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
@@ -182,8 +172,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
             case ReductionOperation::MEAN_SUM:
                 build_opts.add_option(("-DOPERATION=sum"));
                 break;
-            case ReductionOperation::ARG_IDX_MAX:
-            case ReductionOperation::ARG_IDX_MIN:
             case ReductionOperation::MIN:
             case ReductionOperation::MAX:
                 break;
@@ -214,12 +202,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
             build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
            const unsigned int width_leftover = input->info()->dimension(0) % border_val;
            const unsigned int border_width   = (width_leftover != 0) ? border_val - width_leftover : 0;
-           const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
            kernel_axis_name                  = "x";

-           // Set the number of WG based on the input size. If input width is < 128
-           // we can use fewer threads than 8.
-           lws_hint     = cl::NDRange(std::min(8U, num_of_threads));
+           lws_hint     = create_lws_hint_parallel_implementations(input->info()->dimension(0), border_val);
            _border_size = BorderSize(0, border_width, 0, 0);
        }
    }
--
cgit v1.2.1
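
Note on the new helper: create_lws_hint_parallel_implementations() is only referenced by this diff; its definition is not part of the file shown above. The sketch below is an assumption reconstructed from the inline logic removed in the same hunk (padded width divided by 16, capped at 8); the parameter names and includes are illustrative, not the committed implementation.

#include "arm_compute/core/CL/OpenCL.h" // ACL's OpenCL C++ wrapper, provides cl::NDRange
#include <algorithm>

// Sketch (assumption): derive a local-workgroup-size hint from the input width,
// mirroring the code this commit removes from CLReductionOperationKernel::configure().
cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int border_val)
{
    // Pad the width up to a multiple of border_val, as the removed code did via border_width.
    const unsigned int width_leftover = input_dimension % border_val;
    const unsigned int padded_width   = input_dimension + ((width_leftover != 0) ? border_val - width_leftover : 0);

    // One work-item per 16 elements, capped at 8, so inputs narrower than 128 elements use fewer threads.
    const unsigned int num_of_threads = padded_width / 16;
    return cl::NDRange(std::min(8U, num_of_threads));
}

For example, with input->info()->dimension(0) = 100 and border_val = 16, the padded width is 112 and the hint is cl::NDRange(7), consistent with the removed comment about using fewer than 8 threads for widths below 128.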