author | Omar Al Khatib <omar.alkhatib@arm.com> | 2022-12-15 09:12:12 +0000
committer | Omar Al Khatib <omar.alkhatib@arm.com> | 2022-12-21 15:03:22 +0000
commit | e317baf74dd6206932877e254350b9ab913426f1 (patch)
tree | a46999e267957cc9623f0ab644297259b0facf60
parent | 85260d8c21e7209d4777150f436b336f85812dce (diff)
download | ComputeLibrary-e317baf74dd6206932877e254350b9ab913426f1.tar.gz
Optimize MeanReduce by using integer accumulation and removing the upfront dequantization.
Resolves: [COMPMID-5466]
Signed-off-by: Omar Al Khatib <omar.alkhatib@arm.com>
Change-Id: I68af0bb54580bebd2ace1fba30aa73f7f68a4dbb
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8804
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- | arm_compute/runtime/NEON/functions/NEReduceMean.h | 9
-rw-r--r-- | src/core/NEON/kernels/NEReductionOperationKernel.cpp | 92
-rw-r--r-- | src/runtime/NEON/functions/NEReduceMean.cpp | 45
3 files changed, 78 insertions, 68 deletions
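
The core of the change: instead of dequantizing the whole input up front, reducing in float, and requantizing the result (two extra layers plus two F32 staging tensors), the kernels now sum the raw quantized values in an integer accumulator and fold dequantization, division by N, and requantization into a single multiply-add per output. With input quantization (scale s_i, offset z_i), output quantization (scale s_o, offset z_o), and N reduced elements, a real value is x = s_i (q - z_i), so the requantized mean is

\[
q_o \;=\; \frac{1}{s_o}\cdot\frac{s_i}{N}\Bigl(\sum_{k=1}^{N} q_k \;-\; N z_i\Bigr) + z_o
\;=\; A \sum_{k=1}^{N} q_k + B,
\qquad
A = \frac{s_i}{s_o\,N},
\qquad
B = z_o - \frac{s_i\,z_i}{s_o},
\]

which is exactly the A and B precomputed in the hunks below (in_scale = s_i, in_offset = z_i, out_scale = s_o, out_offset = z_o). Because the kernel now requantizes internally, the input and output quantization infos no longer have to match, which is why the ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO check is dropped.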
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index 7512115a3f..caaee8284a 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,6 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -93,13 +91,8 @@ private:
     std::vector<NEReductionOperation> _reduction_kernels;
     std::vector<Tensor> _reduced_outs;
     NEReshapeLayer _reshape;
-    NEDequantizationLayer _dequant;
-    NEQuantizationLayer _requant;
     int _reduction_ops;
     bool _keep_dims;
-    bool _do_requant;
-    Tensor _input_no_quant;
-    Tensor _output_no_quant;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index bfecccf94b..e0f43ab176 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -401,7 +401,8 @@ struct RedOpX
         Iterator input(in, in_win_no_pad);
         Iterator output(out, out_window);
 
-        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+        execute_window_loop(
+            in_win_no_pad, [&](const Coordinates &)
         {
             const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
@@ -609,6 +610,8 @@ struct RedOpX_quantized
     {
         using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
 
+        const auto oq_info = out->info()->quantization_info().uniform();
+
         const TensorInfo in_info = *(in->info());
         const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
@@ -622,7 +625,19 @@ struct RedOpX_quantized
         Iterator input(in, in_win_no_pad);
         Iterator output(out, out_window);
 
-        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+        const float in_offset = static_cast<float>(iq_info.offset);
+        const float in_scale = iq_info.scale;
+
+        const float out_offset = static_cast<float>(oq_info.offset);
+        const float out_scale = oq_info.scale;
+
+        const float num_elements = static_cast<float>(in_info.dimension(0));
+
+        const float A = in_scale / (out_scale * num_elements);
+        const float B = out_offset - (in_scale * in_offset) / (out_scale);
+
+        execute_window_loop(
+            in_win_no_pad, [&](const Coordinates &)
         {
             const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -844,14 +859,17 @@ struct RedOpX_quantized
 
                     if(op == ReductionOperation::MEAN_SUM)
                     {
-                        res /= static_cast<int32_t>(in_info.dimension(0));
+                        const int32_t resFinal = A * (static_cast<float>(res)) + B;
+
+                        *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
                     }
                     else
                     {
                         // Subtract accumulated offsets
                         res -= (in_info.dimension(0) - 1) * iq_info.offset;
+                        *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
                     }
-                    *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+
                     break;
                 }
                 default:
@@ -887,7 +905,8 @@ struct RedOpYZW
         Iterator input(in, in_win_no_pad);
         Iterator output(out, out_win_no_pad);
 
-        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+        execute_window_loop(
+            in_win_no_pad, [&](const Coordinates &)
         {
             const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -1110,7 +1129,8 @@ struct RedOpYZW_complex
         Iterator input(in, in_win_no_pad);
         Iterator output(out, out_win_no_pad);
 
-        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+        execute_window_loop(
+            in_win_no_pad, [&](const Coordinates &)
         {
             // Compute window_step_x elements per iteration
             int x = window_start_x;
@@ -1169,6 +1189,8 @@ struct RedOpYZW_quantized
         const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
         using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
 
+        const auto oq_info = out->info()->quantization_info().uniform();
+
         const int window_step_x = 16 / sizeof(T);
         const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
         const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
@@ -1197,7 +1219,22 @@ struct RedOpYZW_quantized
         vector_type_f vec_res_value3_f{};
         vector_type_f vec_res_value4_f{};
 
-        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+        const float in_offset = static_cast<float>(iq_info.offset);
+        const float in_scale = iq_info.scale;
+
+        const float out_offset = static_cast<float>(oq_info.offset);
+        const float out_scale = oq_info.scale;
+
+        const float num_elements = static_cast<float>(in_info.dimension(axis));
+
+        const float A = in_scale / (out_scale * num_elements);
+        const float B = out_offset - (in_scale * in_offset) / (out_scale);
+
+        const auto vec_A = wrapper::vdup_n(static_cast<float>(A), wrapper::traits::vector_128_tag{});
+        const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{});
+
+        execute_window_loop(
+            in_win_no_pad, [&](const Coordinates &)
         {
             const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -1340,11 +1377,10 @@ struct RedOpYZW_quantized
                 }
                 case ReductionOperation::MEAN_SUM:
                 {
-                    const auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<float>(in_info.dimension(axis)), wrapper::traits::vector_128_tag{}));
-                    vec_res_value1_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value1), vec_width_inv);
-                    vec_res_value2_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value2), vec_width_inv);
-                    vec_res_value3_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value3), vec_width_inv);
-                    vec_res_value4_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value4), vec_width_inv);
+                    vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
+                    vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
+                    vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
+                    vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
 
                     vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
                     vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
@@ -1389,7 +1425,9 @@ struct RedOpYZW_quantized
             // Compute left-over elements
             for(; x < window_end_x; ++x)
             {
-                float res_value = 0.f;
+                float res_value = 0.f;
+                int32_t res_value_q = 0;
+
                 switch(op)
                 {
                     case ReductionOperation::ARG_IDX_MAX:
@@ -1419,11 +1457,15 @@ struct RedOpYZW_quantized
                     switch(op)
                     {
                         case ReductionOperation::SUM:
-                        case ReductionOperation::MEAN_SUM:
                         {
                             res_value += *in_ptr;
                             break;
                         }
+                        case ReductionOperation::MEAN_SUM:
+                        {
+                            res_value_q += *in_ptr;
+                            break;
+                        }
                         case ReductionOperation::SUM_SQUARE:
                         {
                             res_value += *in_ptr * *in_ptr;
@@ -1479,8 +1521,7 @@ struct RedOpYZW_quantized
                 {
                     case ReductionOperation::MEAN_SUM:
                     {
-                        int32_t res = static_cast<int32_t>(res_value);
-                        res /= static_cast<int32_t>(in_info.dimension(axis));
+                        const int32_t res = A * (static_cast<float>(res_value_q)) + B;
                         *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
                         break;
                     }
@@ -1552,30 +1593,46 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
     switch(axis)
     {
         case 0:
+        {
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
+                {
                     return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
+                }
                 case DataType::QASYMM8_SIGNED:
+                {
                     return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
+                }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
                     return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
+                {
                     return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
+                }
                 case DataType::S32:
+                {
                     return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
+                }
                 default:
+                {
                     ARM_COMPUTE_ERROR("Not supported");
+                }
             }
+        }
         case 1:
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
+                {
                     return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+                }
                 case DataType::QASYMM8_SIGNED:
+                {
                     return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+                }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
                     return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
@@ -1655,7 +1712,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
     if(!is_arg_min_max)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
     }
     else
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index a2322c9eda..9c9b79a1e5 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/common/utils/Log.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
@@ -85,14 +83,6 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
         }
         const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-        const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
-        if(requant)
-        {
-            TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
-            NEDequantizationLayer::validate(input, &input_no_quant);
-            TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32));
-            NEQuantizationLayer::validate(&output_no_quant, output);
-        }
     }
     return Status{};
 }
@@ -101,8 +91,7 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
 NEReduceMean::~NEReduceMean() = default;
 
 NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
-      _output_no_quant()
+    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
 {
 }
 
@@ -121,7 +110,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
     const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
     _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -129,18 +117,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
 
     ITensor *tmp_input = input;
     ITensor *tmp_output = output;
-    if(_do_requant)
-    {
-        _memory_group.manage(&_input_no_quant);
-        _memory_group.manage(&_output_no_quant);
-        TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape);
-        output_no_quant_info.set_data_type(DataType::F32);
-        auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info);
-        auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32));
-        _dequant.configure(input, &_input_no_quant);
-        tmp_input = &_input_no_quant;
-        tmp_output = &_output_no_quant;
-    }
 
     Coordinates axis_local = reduction_axis;
     const int input_dims = tmp_input->info()->num_dimensions();
@@ -160,7 +136,7 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
         }
         else
         {
-            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), tmp_output->info()->data_type(), tmp_output->info()->quantization_info()));
             _memory_group.manage(&_reduced_outs[i]);
             _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
         }
@@ -171,7 +147,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
     {
         _reduced_outs[i].allocator()->allocate();
     }
-
     // Configure reshape layer if we want to drop the dimensions
     if(!keep_dims)
     {
@@ -186,21 +161,11 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
         auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
         _reshape.configure(&_reduced_outs[_reduction_ops - 1], tmp_output);
     }
-    if(_do_requant)
-    {
-        _requant.configure(&_output_no_quant, output);
-        _input_no_quant.allocator()->allocate();
-        _output_no_quant.allocator()->allocate();
-    }
 }
 
 void NEReduceMean::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-    if(_do_requant)
-    {
-        _dequant.run();
-    }
     for(auto &kernel : _reduction_kernels)
     {
         kernel.run();
@@ -209,9 +174,5 @@ void NEReduceMean::run()
     {
         _reshape.run();
     }
-    if(_do_requant)
-    {
-        _requant.run();
-    }
 }
 } // namespace arm_compute
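
For readers who want to see the arithmetic in isolation, below is a minimal stand-alone sketch of the scalar path. It is an illustration only, not ComputeLibrary code: QuantInfo and quantized_mean are hypothetical names for this sketch. The real kernels vectorize the same multiply-add with wrapper::vmla and clamp with utils::cast::saturate_cast<T>, exactly as in the diff above.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical quantization descriptor for this sketch; the kernels read the
// same two fields from UniformQuantizationInfo.
struct QuantInfo
{
    float scale;
    int32_t offset;
};

// Requantized mean over a QASYMM8 vector, computed the way the patched kernel
// does it: integer accumulation, then one fused multiply-add.
uint8_t quantized_mean(const std::vector<uint8_t> &q, QuantInfo in, QuantInfo out)
{
    // Accumulate raw quantized values; no per-element dequantization.
    int32_t sum = 0;
    for(uint8_t v : q)
    {
        sum += v;
    }

    // Fold dequantize -> divide by N -> requantize into A * sum + B,
    // matching the constants precomputed in the diff above.
    const float n = static_cast<float>(q.size());
    const float A = in.scale / (out.scale * n);
    const float B = static_cast<float>(out.offset) - (in.scale * static_cast<float>(in.offset)) / out.scale;

    const float res = A * static_cast<float>(sum) + B;

    // Clamp to the output type's range, as utils::cast::saturate_cast<T> does.
    return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, res)));
}

int main()
{
    // Input: scale 0.5, offset 10 -> {12, 14, 16, 18} encode the real values {1, 2, 3, 4}.
    // Output: scale 0.25, offset 0 -> the real mean 2.5 quantizes to 10.
    const std::vector<uint8_t> q{12, 14, 16, 18};
    const uint8_t m = quantized_mean(q, {0.5f, 10}, {0.25f, 0});
    std::printf("quantized mean = %u\n", static_cast<unsigned>(m)); // prints 10
    return 0;
}

The one-pass formulation is also why the removed F32 staging tensors (_input_no_quant, _output_no_quant) and the dequantize/requantize layers are no longer needed: the requantization constant B absorbs both offsets, so no intermediate float tensor is ever materialized.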