aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/kernels/CLReductionOperationKernel.cpp
diff options
context:
space:
mode:
authorManuel Bottini <manuel.bottini@arm.com>2019-10-21 17:59:07 +0100
committerManuel Bottini <manuel.bottini@arm.com>2019-12-03 13:58:56 +0000
commit7b9998d0fe1f98768b690ead10ebfa166d1b873d (patch)
treed3f6b81fb2e414a9e0f8ed9597eab27ef970d725 /src/core/CL/kernels/CLReductionOperationKernel.cpp
parentf9179d393a07eb9eed753e315df79d22391906c6 (diff)
downloadComputeLibrary-7b9998d0fe1f98768b690ead10ebfa166d1b873d.tar.gz
COMPMID-1816: Use parallel reduction on 0 axis in CL ARG_MIN/ARG_MAX
Introducing new CLArgMinMax kernel Change-Id: I0b8254207cc3859d19ceef9b6429cf5c1c586db0 Signed-off-by: Manuel Bottini <manuel.bottini@arm.com> Reviewed-on: https://review.mlplatform.org/c/2202 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLReductionOperationKernel.cpp')
-rw-r--r--src/core/CL/kernels/CLReductionOperationKernel.cpp27
1 files changed, 6 insertions, 21 deletions
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index cbf3923243..91ee83e530 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -60,19 +60,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN, "Not supported reduction operation, use CLArgMinMaxLayer");
if(output->total_size() != 0)
{
- if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, "Not supported operation for QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
@@ -81,9 +74,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
// Output tensor auto initialization if not yet initialized
- const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, !is_arg_min_max);
- const DataType output_data_type = is_arg_min_max ? DataType::S32 : input->data_type();
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, true);
+ DataType output_data_type = input->data_type();
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
@@ -166,8 +158,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
- build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
- build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
@@ -182,8 +172,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
case ReductionOperation::MEAN_SUM:
build_opts.add_option(("-DOPERATION=sum"));
break;
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
case ReductionOperation::MIN:
case ReductionOperation::MAX:
break;
@@ -214,12 +202,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
const unsigned int width_leftover = input->info()->dimension(0) % border_val;
const unsigned int border_width = (width_leftover != 0) ? border_val - width_leftover : 0;
- const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
kernel_axis_name = "x";
- // Set the number of WG based on the input size. If input width is < 128
- // we can use fewer threads than 8.
- lws_hint = cl::NDRange(std::min(8U, num_of_threads));
+ lws_hint = create_lws_hint_parallel_implementations(input->info()->dimension(0), border_val);
_border_size = BorderSize(0, border_width, 0, 0);
}
}