From 2d7e683e79c8ad328d4930c1f82a46827313faf4 Mon Sep 17 00:00:00 2001 From: George Wort Date: Fri, 22 Feb 2019 16:37:41 +0000 Subject: COMPMID-1694: Fuse offset contribution with the output stage when we use NEGEMMLowpMatrixMultiplyCore Change-Id: Ic1a681e4cc03e1eba3bf8485d9cdb17b3e926047 Signed-off-by: giuros01 Reviewed-on: https://review.mlplatform.org/c/561 Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins --- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 39 ++-------------------- 1 file changed, 3 insertions(+), 36 deletions(-) (limited to 'src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp') diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index f0ac695b20..d3cfc7a8fa 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -86,37 +86,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen namespace arm_compute { class Coordinates; - -/* Function used by the left-over for loop to perform the quantization */ -template <bool is_bounded_relu> -inline uint8_t finalize_quantization(int32x4_t in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8_t min_u8, uint8_t max_u8) -{ - const static int32x4_t zero_s32 = vdupq_n_s32(0); - const static int32x4_t sat_value_s32 = vdupq_n_s32(255); - - // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_s32 = vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier); - - // Round to the nearest division by a power-of-two using result_shift_s32 - in_s32 = rounding_divide_by_pow2(in_s32, result_shift); - - // Add the offset terms - in_s32 = vaddq_s32(in_s32, result_offset_after_shift_s32); - - // Saturate negative values - in_s32 = 
vmaxq_s32(in_s32, zero_s32); - in_s32 = vminq_s32(in_s32, sat_value_s32); - - auto out_u8 = static_cast<uint8_t>(vgetq_lane_s32(in_s32, 0)); - - if(is_bounded_relu) - { - out_u8 = std::max(out_u8, min_u8); - out_u8 = std::min(out_u8, max_u8); - } - - return out_u8; -} } // namespace arm_compute template <bool is_bounded_relu> @@ -188,10 +157,8 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window // Add bias in_value += bias_value; - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), - static_cast<uint8_t>(_max)); + *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max)); } }, in, out, bias); @@ -220,10 +187,10 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window // Compute left-over elements for(; x < window_end_x; ++x) { - const int32x4_t in_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x); + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); // Finalize and store the result - *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max)); + *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max)); } }, in, out); -- cgit v1.2.1