From e317baf74dd6206932877e254350b9ab913426f1 Mon Sep 17 00:00:00 2001 From: Omar Al Khatib Date: Thu, 15 Dec 2022 09:12:12 +0000 Subject: Optimize MeanReduce by integer acc. and removing upfront dequant. Resolves: [COMPMID-5466] Signed-off-by: Omar Al Khatib Change-Id: I68af0bb54580bebd2ace1fba30aa73f7f68a4dbb Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8804 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins --- .../NEON/kernels/NEReductionOperationKernel.cpp | 92 +++++++++++++++++----- 1 file changed, 74 insertions(+), 18 deletions(-) (limited to 'src/core/NEON/kernels') diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index bfecccf94b..e0f43ab176 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -401,7 +401,8 @@ struct RedOpX Iterator input(in, in_win_no_pad); Iterator output(out, out_window); - execute_window_loop(in_win_no_pad, [&](const Coordinates &) + execute_window_loop( + in_win_no_pad, [&](const Coordinates &) { const auto input_ptr = reinterpret_cast(input.ptr()); @@ -609,6 +610,8 @@ struct RedOpX_quantized { using PromotedType = typename wrapper::traits::promote::type>::type; + const auto oq_info = out->info()->quantization_info().uniform(); + const TensorInfo in_info = *(in->info()); const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); @@ -622,7 +625,19 @@ struct RedOpX_quantized Iterator input(in, in_win_no_pad); Iterator output(out, out_window); - execute_window_loop(in_win_no_pad, [&](const Coordinates &) + const float in_offset = static_cast(iq_info.offset); + const float in_scale = iq_info.scale; + + const float out_offset = static_cast(oq_info.offset); + const float out_scale = oq_info.scale; + + const float num_elements = static_cast(in_info.dimension(0)); + + const float A = in_scale / (out_scale * num_elements); + const float B = out_offset - (in_scale * in_offset) / (out_scale); + + execute_window_loop( + in_win_no_pad, [&](const Coordinates &) { const auto input_ptr = reinterpret_cast(input.ptr()); @@ -844,14 +859,17 @@ struct RedOpX_quantized if(op == ReductionOperation::MEAN_SUM) { - res /= static_cast(in_info.dimension(0)); + const int32_t resFinal = A * (static_cast(res)) + B; + + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(resFinal); } else { // Subtract accumulated offsets res -= (in_info.dimension(0) - 1) * iq_info.offset; + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); } - *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); + break; } default: @@ -887,7 +905,8 @@ struct RedOpYZW Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); - execute_window_loop(in_win_no_pad, [&](const Coordinates &) + execute_window_loop( + in_win_no_pad, [&](const Coordinates &) { const auto input_ptr = reinterpret_cast(input.ptr()); @@ -1110,7 +1129,8 @@ struct RedOpYZW_complex Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); - execute_window_loop(in_win_no_pad, [&](const Coordinates &) + execute_window_loop( + in_win_no_pad, [&](const Coordinates &) { // Compute window_step_x elements per iteration int x = window_start_x; @@ -1169,6 +1189,8 @@ struct RedOpYZW_quantized const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); using PromotedType = typename wrapper::traits::promote::type>::type; + const auto oq_info = out->info()->quantization_info().uniform(); + const int window_step_x = 16 / sizeof(T); const auto window_start_x_tmp = static_cast(in_window.x().start()); const auto window_end_x_tmp = static_cast(in_window.x().end()); @@ -1197,7 +1219,22 @@ struct RedOpYZW_quantized vector_type_f vec_res_value3_f{}; vector_type_f vec_res_value4_f{}; - execute_window_loop(in_win_no_pad, [&](const Coordinates &) + const float in_offset = static_cast(iq_info.offset); + const float in_scale = iq_info.scale; + + const float out_offset = static_cast(oq_info.offset); + const float out_scale = oq_info.scale; + + const float num_elements = static_cast(in_info.dimension(axis)); + + const float A = in_scale / (out_scale * num_elements); + const float B = out_offset - (in_scale * in_offset) / (out_scale); + + const auto vec_A = wrapper::vdup_n(static_cast(A), wrapper::traits::vector_128_tag{}); + const auto vec_B = wrapper::vdup_n(static_cast(B), wrapper::traits::vector_128_tag{}); + + execute_window_loop( + in_win_no_pad, [&](const Coordinates &) { const auto input_ptr = reinterpret_cast(input.ptr()); @@ -1340,11 +1377,10 @@ struct RedOpYZW_quantized } case ReductionOperation::MEAN_SUM: { - const auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), wrapper::traits::vector_128_tag{})); - vec_res_value1_f = wrapper::vmul(wrapper::vcvt(vec_res_value1), vec_width_inv); - vec_res_value2_f = wrapper::vmul(wrapper::vcvt(vec_res_value2), vec_width_inv); - vec_res_value3_f = wrapper::vmul(wrapper::vcvt(vec_res_value3), vec_width_inv); - vec_res_value4_f = wrapper::vmul(wrapper::vcvt(vec_res_value4), vec_width_inv); + vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value1), vec_A); + vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value2), vec_A); + vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value3), vec_A); + vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value4), vec_A); vec_res_value1 = wrapper::vcvt(vec_res_value1_f); vec_res_value2 = wrapper::vcvt(vec_res_value2_f); @@ -1389,7 +1425,9 @@ struct RedOpYZW_quantized // Compute left-over elements for(; x < window_end_x; ++x) { - float res_value = 0.f; + float res_value = 0.f; + int32_t res_value_q = 0; + switch(op) { case ReductionOperation::ARG_IDX_MAX: @@ -1419,11 +1457,15 @@ struct RedOpYZW_quantized switch(op) { case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: { res_value += *in_ptr; break; } + case ReductionOperation::MEAN_SUM: + { + res_value_q += *in_ptr; + break; + } case ReductionOperation::SUM_SQUARE: { res_value += *in_ptr * *in_ptr; @@ -1479,8 +1521,7 @@ struct RedOpYZW_quantized { case ReductionOperation::MEAN_SUM: { - int32_t res = static_cast(res_value); - res /= static_cast(in_info.dimension(axis)); + const int32_t res = A * (static_cast(res_value_q)) + B; *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res); break; } @@ -1552,30 +1593,46 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi switch(axis) { case 0: + { switch(input->info()->data_type()) { case DataType::QASYMM8: + { return Reducer>::reduceX(window, input, output, RedOpX_quantized(), op); + } case DataType::QASYMM8_SIGNED: + { return Reducer>::reduceX(window, input, output, RedOpX_quantized(), op); + } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: return Reducer>::reduceX(window, input, output, RedOpX(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: + { return Reducer>::reduceX(window, input, output, RedOpX(), op); + } case DataType::S32: + { return Reducer>::reduceX(window, input, output, RedOpX(), op); + } default: + { ARM_COMPUTE_ERROR("Not supported"); + } } + } case 1: switch(input->info()->data_type()) { case DataType::QASYMM8: + { return Reducer>::reduceY(window, input, output, RedOpYZW_quantized(), op); + } case DataType::QASYMM8_SIGNED: + { return Reducer>::reduceY(window, input, output, RedOpYZW_quantized(), op); + } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: return Reducer>::reduceY(window, input, output, RedOpYZW(), op); @@ -1655,7 +1712,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u if(!is_arg_min_max) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels()); } else -- cgit v1.2.1