From f29d1b7d8bf2d1619554eb3443556b44d4aa1a4c Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Tue, 29 Oct 2019 10:58:13 +0000 Subject: COMPMID-2608: Enable quantization with multiplier greater than 1 on NEON Change-Id: Ib2b0c9ac88fc2b645f478c9981f71ee28f2c77fd Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2425 Comments-Addressed: Arm Jenkins Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins --- .../NEDepthwiseConvolutionLayerNativeKernel.cpp | 33 +++++++++++++--------- .../NEDirectConvolutionLayerOutputStageKernel.cpp | 7 +++-- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 8 ++++-- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 7 +++-- src/core/utils/quantization/AsymmHelpers.cpp | 10 +++---- 5 files changed, 39 insertions(+), 26 deletions(-) (limited to 'src/core') diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp index a9a3183c5d..aee13ee578 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp @@ -289,7 +289,16 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w acc.at(i) += *reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)); } - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), output_multiplier.at(id.x() + i)), output_shift.at(id.x() + i)) + output_qoffset; + const int out_mul = output_multiplier.at(id.x() + i); + const int out_shift = output_shift.at(id.x() + i); + if(out_shift < 0) + { + acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; + } out_vals[i] = static_cast(utility::clamp(acc.at(i))); } @@ -381,21 +390,20 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh if(has_biases) { - const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + } - int32_t out_val = acc.at(m) + biases_val; - out_val = rounding_divide_by_exp2(saturating_doubling_high_mul(out_val, output_multiplier.at(id.x() + m)), - output_shift.at(id.x() + m)) - + output_qoffset; - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(out_val)); + const int out_mul = output_multiplier.at(id.x() + m); + const int out_shift = output_shift.at(id.x() + m); + if(out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; } else { - int32_t out_val = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), output_multiplier.at(id.x() + m)), - output_shift.at(id.x() + m)) - + output_qoffset; - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(out_val)); + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; } + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); } }, input_it, weights_it, biases_it, output_it); @@ -531,8 +539,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co int32_t out_mult = 0; int32_t out_shift = 0; const float multiplier = input_scale * weights_scale.at(i) / output_scale; - ARM_COMPUTE_ERROR_ON(multiplier > 1.f); - arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &out_mult, &out_shift); + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); _output_multiplier.push_back(out_mult); _output_shift.push_back(out_shift); diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp index 4313a5e312..8834d9747a 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp @@ -38,14 +38,15 @@ #include #include -using namespace arm_compute; - +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) { ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); ARM_COMPUTE_UNUSED(result_offset_after_shift); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); @@ -53,7 +54,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con DataType::F16, DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(result_shift < 0, "Result shift must be a non negative integer"); if(bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::S32, DataType::F32); @@ -596,3 +596,4 @@ void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); } +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index 84187332f8..86abb2d65c 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -909,8 +909,12 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); // Check if we need to clamp the result using min and max - const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound) - && !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255)); + PixelValue type_min = 0; + PixelValue type_max = 0; + std::tie(type_min, type_max) = get_min_max(output->info()->data_type()); + int type_min_int = type_min.get(); + int type_max_int = type_max.get(); + const bool is_bounded_relu = !(output_stage.gemmlowp_min_bound == type_min_int && output_stage.gemmlowp_max_bound == type_max_int); // Check if we need to perform fixed point requantization const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index 4906e6a987..bb0b86404e 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -39,8 +39,8 @@ #include #include -using namespace arm_compute; - +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) @@ -244,4 +244,5 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); (this->*_func)(window); -} \ No newline at end of file +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index 11241e83a0..5bda746e09 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -106,10 +106,10 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, return Status{}; } -arm_compute::Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info, - const QuantizationInfo &wq_info, - const QuantizationInfo &oq_info, - GEMMLowpOutputStageInfo &stage_info) +arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, + const QuantizationInfo &wq_info, + const QuantizationInfo &oq_info, + GEMMLowpOutputStageInfo &stage_info) { ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty()); ARM_COMPUTE_RETURN_ERROR_ON(wq_info.scale().empty()); @@ -131,7 +131,7 @@ arm_compute::Status calculate_quantized_multipliers_less_than_one(const Quantiza const float multiplier = i_scale * w_scales[i] / o_scale; int32_t quant_multiplier = 0; int32_t quant_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &quant_shift)); + ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier(multiplier, &quant_multiplier, &quant_shift)); quant_multipliers[i] = quant_multiplier; quant_shifts[i] = quant_shift; } -- cgit v1.2.1