From 2697fd8fa42425f7bfdd60dd486d4c2132b06523 Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park
Date: Tue, 15 Oct 2019 16:49:24 +0100
Subject: COMPMID-2707: add keep_dims parameter to Reduction Operation

The added parameter decides whether or not to keep the target dimension
of the reduction operation. ArgMinMax operations always remove the
reduced dimension.

The following are updated to support the parameter:

- [CL/NEON] functions and reference kernel
- [CL/NEON] ArgMinMax function to use ReductionOperation function
- [CL/NEON] validation test suite for Reduction and ArgMinMax operations
  to validate the added parameter
- ReductionOperationFixture is modified NOT to pre-populate the output
  tensor; it now relies on the underlying kernel/function.
- The CL validation test suite for Reduction operation is adjusted to
  remove excessive test cases with axis values beyond the input tensor's
  dimensions.

Change-Id: I3e24d276ed469a4201f323001708f0c525f11c4f
Signed-off-by: Sang-Hoon Park
Reviewed-on: https://review.mlplatform.org/c/2167
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Reviewed-by: Georgios Pinitas
---
 src/runtime/CL/functions/CLReductionOperation.cpp | 161 ++++++++++++++++++----
 1 file changed, 133 insertions(+), 28 deletions(-)

(limited to 'src/runtime/CL/functions/CLReductionOperation.cpp')
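As a usage illustration of the new parameter (a minimal sketch, not part of the patch: the tensor shape, the SUM operation, and the scheduler setup are all assumed), reducing a (4096, 16) tensor along axis 0 with keep_dims = false yields a (16) output, while keep_dims = true would keep the axis as (1, 16); ArgMinMax operations always behave as if keep_dims were false:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

    using namespace arm_compute;

    void reduction_keep_dims_example()
    {
        CLScheduler::get().default_init();

        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(4096U, 16U), 1, DataType::F32));

        // keep_dims == false: axis 0 is removed, so configure() auto-initialises
        // the output to shape (16); with true it would stay (1, 16).
        CLReductionOperation reduction;
        reduction.configure(&input, &output, 0, ReductionOperation::SUM, /* keep_dims */ false);

        input.allocator()->allocate();
        output.allocator()->allocate();
        // ... fill input via map()/unmap() ...
        reduction.run();
        CLScheduler::get().sync();
    }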
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 38f0a7523c..447c15b1e8 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -26,15 +26,17 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
@@ -56,17 +58,52 @@ unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int a
 } // namespace
 
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
+    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape_kernel(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(),
+      _is_reshape_required(false)
 {
 }
 
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
-    const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-    bool               is_serial     = is_data_type_quantized(input->data_type()) || axis != 0;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+    const unsigned int num_of_stages       = calculate_number_of_stages(input, axis);
+    const bool         is_serial           = needs_serialized_reduction(op, input->data_type(), axis);
+    const bool         is_arg_min_max      = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+    const bool         is_reshape_required = !keep_dims || is_arg_min_max;
+
+    if(is_reshape_required)
+    {
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+    }
+
+    auto *output_internal = output;
+
+    TensorInfo output_before_reshape;
+    const auto input_shape        = input->tensor_shape();
+    const auto input_data_type    = input->data_type();
+    const auto input_num_channels = input->num_channels();
+    const auto input_qinfo        = input->quantization_info();
+    const auto output_data_type   = is_arg_min_max ? DataType::U32 : output->data_type();
+
+    auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
+    {
+        ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
+    };
+
+    if(is_reshape_required)
+    {
+        auto shape_before_reshape = input_shape;
+        shape_before_reshape.set(axis, 1);
+        initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channels, input_qinfo);
+        output_internal = &output_before_reshape;
+    }
+
     if(is_serial)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
     }
     else
     {
@@ -74,14 +111,13 @@ Status CLReductionOperation::validate(const ITensorInf
         std::vector<TensorInfo> sums_vector(num_of_stages - 1);
 
         // Create intermediate tensor info
-        TensorShape shape{ input->tensor_shape() };
+        TensorShape shape{ input_shape };
+
+        shape.set(0, ceil(shape.x() / 128.f));
 
         for(unsigned int i = 0; i < num_of_stages - 1; i++)
         {
-            shape.set(0, ceil(shape.x() / 128.f));
-            sums_vector[i].set_data_type(input->data_type());
-            sums_vector[i].set_tensor_shape(shape);
-            sums_vector[i].set_num_channels(input->num_channels());
+            initialize_tensorinfo(sums_vector[i], shape, input_data_type, input_num_channels, input_qinfo);
         }
 
         ReductionOperation first_kernel_op;
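To make the shape contract of validate() above concrete, a small sketch (the 3D shape, the data type, and the MEAN_SUM operation are assumptions for illustration): compute_reduced_shape() drops the reduced axis when keep_dims is false, and validate() rejects outputs that do not match.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"
    #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

    using namespace arm_compute;

    void reduction_validate_example()
    {
        const TensorInfo input_info(TensorShape(4096U, 16U, 8U), 1, DataType::F32);

        // keep_dims == false along axis 1: the expected output shape is (4096, 8);
        // with keep_dims == true it would be (4096, 1, 8).
        const TensorShape reduced_shape = misc::shape_calculator::compute_reduced_shape(input_info.tensor_shape(), 1, false);
        const TensorInfo  output_info(reduced_shape, 1, DataType::F32);

        const Status status = CLReductionOperation::validate(&input_info, &output_info, 1, ReductionOperation::MEAN_SUM, false);
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }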
@@ -130,17 +166,72 @@ Status CLReductionOperation::validate(const ITensorInf
         // Validate ReductionOperation on the last stage
         const unsigned int last_stage = num_of_stages - 1;
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output_internal, axis, last_kernel_op, input->dimension(0)));
+    }
+
+    if(is_reshape_required)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(output_internal, output));
     }
 
     return Status{};
 }
 
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
+{
+    if(!_is_reshape_required && _is_serial)
+    {
+        return output;
+    }
+
+    auto       intermediate_result_vector_size = _is_serial ? 1 : _num_of_stages;
+    const auto is_arg_min_max                  = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN);
+
+    if(!_is_reshape_required)
+    {
+        --intermediate_result_vector_size;
+    }
+
+    _results_vector.resize(intermediate_result_vector_size);
+    auto shape = input->info()->tensor_shape();
+
+    shape.set(_reduction_axis, _is_serial ? 1 : ceil(shape.x() / 128.f));
+
+    for(auto &v : _results_vector)
+    {
+        if(&v == &_results_vector.back() && _is_reshape_required)
+        {
+            shape.set(_reduction_axis, 1);
+        }
+        v.allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+    }
+
+    if(is_arg_min_max)
+    {
+        _results_vector.back().info()->set_data_type(DataType::U32).set_is_resizable(true).reset_padding();
+    }
+
+    return _is_reshape_required ? &_results_vector.back() : output;
+}
+
+void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
-    _num_of_stages  = calculate_number_of_stages(input->info(), axis);
-    _reduction_axis = axis;
-    _is_serial      = is_data_type_quantized(input->info()->data_type()) || axis != 0;
+    _op                       = op;
+    _num_of_stages            = calculate_number_of_stages(input->info(), axis);
+    _reduction_axis           = axis;
+    _is_serial                = needs_serialized_reduction(op, input->info()->data_type(), axis);
+    const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+    _is_reshape_required      = !keep_dims || is_arg_min_max;
+
+    auto *output_internal = configure_intermediate_result_vector(input, output);
+
+    // ArgMinMax might not give initialized output tensor, so initialize here.
+    if(_is_reshape_required)
+    {
+        const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+        const auto        output_data_type = is_arg_min_max ? DataType::U32 : input->info()->data_type();
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+    }
 
     // Configure reduction operation kernels
     _reduction_kernels_vector.resize(_num_of_stages);
@@ -148,20 +239,16 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
     // Create temporary tensors
     if(_is_serial)
     {
-        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+        if(_is_reshape_required)
+        {
+            _memory_group.manage(&_results_vector.back());
+        }
+
+        _reduction_kernels_vector[0].configure(input, output_internal, axis, op, 0);
     }
     else
     {
         _border_handlers_vector.resize(_num_of_stages);
-        _results_vector.resize(_num_of_stages - 1);
-        TensorShape shape{ input->info()->tensor_shape() };
-        for(unsigned int i = 0; i < _num_of_stages - 1; i++)
-        {
-            shape.set(0, ceil(shape.x() / 128.f));
-            _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
-        }
-
-        // Apply ReductionOperation only on first kernel
         _memory_group.manage(&_results_vector[0]);
 
         ReductionOperation first_kernel_op;
@@ -262,10 +349,22 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage  = _num_of_stages - 1;
         const unsigned int input_width = input->info()->dimension(0);
-        _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
+
+        if(_is_reshape_required)
+        {
+            _memory_group.manage(&_results_vector.back());
+        }
+
+        _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width);
         _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
         _results_vector[last_stage - 1].allocator()->allocate();
     }
+
+    if(_is_reshape_required)
+    {
+        _reshape_kernel.configure(&_results_vector.back(), output);
+        _results_vector.back().allocator()->allocate();
+    }
 }
 
 void CLReductionOperation::run()
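Since the commit routes ArgMinMax through this function, a short sketch of that case (the tensor names and input shape are assumed): the reduced axis is always removed and, per the auto-initialisation in configure() above, the output becomes a tensor of U32 indices.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

    using namespace arm_compute;

    void argmax_example()
    {
        CLScheduler::get().default_init();

        CLTensor input, indices;
        input.allocator()->init(TensorInfo(TensorShape(4096U, 16U), 1, DataType::F32));

        // ARG_IDX_MAX forces the reshape path: whatever keep_dims is passed,
        // the output is auto-initialised to a (16) tensor of U32 indices.
        CLReductionOperation argmax;
        argmax.configure(&input, &indices, 0, ReductionOperation::ARG_IDX_MAX, false);

        input.allocator()->allocate();
        indices.allocator()->allocate();
        argmax.run();
        CLScheduler::get().sync();
    }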
@@ -284,4 +383,10 @@ void CLReductionOperation::run()
             CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
         }
     }
+
+    if(_is_reshape_required)
+    {
+        CLScheduler::get().enqueue(_reshape_kernel, false);
+    }
 }
+} // namespace arm_compute
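The staged run() loop above executes the kernels configured earlier; on the parallel path, each intermediate stage tensor is sized with the 128-element work-group step (ceil(x / 128)), as seen in both validate() and configure_intermediate_result_vector(). A standalone arithmetic sketch with an assumed input width:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Assumed example width; 128.f mirrors ceil(shape.x() / 128.f) in the patch.
        const unsigned int input_width = 4096U;
        const unsigned int stage_width = static_cast<unsigned int>(std::ceil(input_width / 128.f));
        std::printf("intermediate stage width: %u\n", stage_width); // prints 32
        return 0;
    }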