From 5979b217ee887c4442e1affa7dc4378a4a642b95 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Thu, 6 Feb 2020 11:58:51 +0000 Subject: COMPMID-3087: Fails NEPoolingLayer on Nightlies #688 Using a single requantization function instead of dequantizing and quantizing every time Change-Id: Ie62e4299d9e1dc2ae95fc742ef3ee3bb17cd4c78 Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2693 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 172 +++++++++++++++++++------ tests/validation/NEON/PoolingLayer.cpp | 5 +- 2 files changed, 131 insertions(+), 46 deletions(-) diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index 36116d20ec..349e64640c 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -324,64 +324,122 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } template -inline T vquantize_q8(const float32x4x2_t &qv, const UniformQuantizationInfo &qi); +inline T vcvtq_q32_f32(float32x4_t values); template <> -inline uint8x8_t vquantize_q8(const float32x4x2_t &qv, const UniformQuantizationInfo &qi) +inline uint32x4_t vcvtq_q32_f32(float32x4_t values) { - return vquantize(qv, qi); +#ifdef __aarch64__ + return vcvtnq_u32_f32(values); +#else //__aarch64__ + return vcvtq_u32_f32(values); +#endif //__aarch64__ } template <> -inline int8x8_t vquantize_q8(const float32x4x2_t &qv, const UniformQuantizationInfo &qi) +inline int32x4_t vcvtq_q32_f32(float32x4_t values) { - return vquantize_signed(qv, qi); +#ifdef __aarch64__ + return vcvtnq_s32_f32(values); +#else //__aarch64__ + return vcvtq_s32_f32(values); +#endif //__aarch64__ } template -inline T vquantize_q8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); +inline float32x4_t vcvtq_f32_q32(T values); template <> -inline uint8x16_t 
vquantize_q8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +inline float32x4_t vcvtq_f32_q32(uint32x4_t values) { - return vquantize(qv, qi); + return vcvtq_f32_u32(values); } template <> -inline int8x16_t vquantize_q8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +inline float32x4_t vcvtq_f32_q32(int32x4_t values) { - return vquantize_signed(qv, qi); + return vcvtq_f32_s32(values); } -template -inline T vcvtq_q32_f32(float32x4_t values); +template +inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset); template <> -inline uint32x4_t vcvtq_q32_f32(float32x4_t values) +inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) { - return vcvtq_u32_f32(values); + const float new_scale = quant_rescale / scale_pooling; + return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset)); } template <> -inline int32x4_t vcvtq_q32_f32(float32x4_t values) +inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) { - return vcvtq_s32_f32(values); + const float new_scale = quant_rescale / scale_pooling; + return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset)); +} + +template +inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo); + +template <> +inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) +{ + const float32x4x4_t acc = + { + { + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), + } + }; + return vquantize(acc, requant_qinfo); +} + 
+template <> +inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) +{ + const float32x4x4_t acc = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), + } + }; + return vquantize_signed(acc, requant_qinfo); } template -inline float32x4_t vcvtq_f32_q32(T values); +inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo); template <> -inline float32x4_t vcvtq_f32_q32(uint32x4_t values) +inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) { - return vcvtq_f32_u32(values); + const float32x4x2_t acc = + { + { + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), + } + }; + return vquantize(acc, requant_qinfo); } template <> -inline float32x4_t vcvtq_f32_q32(int32x4_t values) +inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) { - return vcvtq_f32_s32(values); + const float32x4x2_t acc = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), + } + }; + return vquantize_signed(acc, requant_qinfo); } + } // namespace NEPoolingLayerKernel::NEPoolingLayerKernel() @@ -677,6 +735,10 @@ void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Wi const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform(); const bool have_different_qinfo = input_qinfo != output_qinfo; + const float requant_scale = output_qinfo.scale / input_qinfo.scale; + const int32_t requant_offset = output_qinfo.offset - static_cast(static_cast(input_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = 
UniformQuantizationInfo(requant_scale, requant_offset); + execute_window_loop(window, [&](const Coordinates & id) { const auto top_data = wrapper::vloadq(input_top_ptr + input.offset()); @@ -727,7 +789,7 @@ void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Wi wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1]))); - // Scale lower result + // Scale upper result scale_vector_q16x8(exclude_padding, res_upper, id, 1, 2, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); @@ -747,7 +809,7 @@ void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Wi if(have_different_qinfo) { - const auto requantized_output = vquantize_q8(vdequantize(wrapper::vcombine(lower_res, upper_res), input_qinfo), output_qinfo); + const auto requantized_output = vrequantize_pooling(lower_res, upper_res, requant_qinfo); lower_res = wrapper::vgetlow(requantized_output); upper_res = wrapper::vgethigh(requantized_output); } @@ -929,6 +991,10 @@ void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Wi const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform(); const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform(); + const float requant_scale = output_qinfo.scale / input_qinfo.scale; + const int32_t requant_offset = output_qinfo.offset - static_cast(static_cast(input_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const T *const input_top_ptr = reinterpret_cast(_input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); const T *const input_middle_ptr = reinterpret_cast(_input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), 
-static_cast(pool_pad_top) + 1))); const T *const input_bottom_ptr = reinterpret_cast(_input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2))); @@ -1034,7 +1100,7 @@ void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Wi { if(input_qinfo != output_qinfo) { - fqres = vquantize_q8(vdequantize(fqres, input_qinfo), output_qinfo); + fqres = vrequantize_pooling(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo); } wrapper::vstore(reinterpret_cast(output.ptr()), fqres); } @@ -1042,7 +1108,7 @@ void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Wi { if(input_qinfo != output_qinfo) { - fres = vquantize_q8(vdequantize(fres, input_qinfo), output_qinfo); + fres = vrequantize_pooling(fres, requant_qinfo); } wrapper::vstore(reinterpret_cast(output.ptr()), fres); } @@ -1838,6 +1904,15 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform(); const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform(); + const float quant_rescale = output_qinfo.scale / input_qinfo.scale; + // "new_offset" doesn't have to consider the "half_scale_v" in its computation + // With a requantization performed in a single step there won't be uncertainties introduced + const int32_t new_offset = output_qinfo.offset - static_cast( static_cast(input_qinfo.offset) / quant_rescale); + + const float requant_scale = output_qinfo.scale / input_qinfo.scale; + const int32_t requant_offset = output_qinfo.offset - static_cast(static_cast(input_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + execute_window_loop(window, [&](const Coordinates & id) { const int idx_width = id.y() * pool_stride_x; @@ -1860,7 +1935,6 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window 
&window_input, const // Calculate scale const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - const float32x4_t scale_v = vdupq_n_f32(scale); // Perform pooling for(int y = pool_start_y; y < pool_end_y; ++y) @@ -1878,24 +1952,38 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); } } - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + if(input_qinfo != output_qinfo) { - const auto requantized_output = vquantize_q8(vdequantize(wrapper::vcombine(res1, res2), input_qinfo), output_qinfo); - res1 = wrapper::vgetlow(requantized_output); - res2 = wrapper::vgethigh(requantized_output); + const float32x4x4_t vres = + { + { + vcvtq_f32_q32(vres1), + vcvtq_f32_q32(vres2), + vcvtq_f32_q32(vres3), + vcvtq_f32_q32(vres4), + } + }; + const auto requantized_output = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast(output.ptr()), wrapper::vgetlow(requantized_output)); + wrapper::vstore(reinterpret_cast(output.ptr()) + 8, wrapper::vgethigh(requantized_output)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding 
towards zero + vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast(output.ptr()), res1); + wrapper::vstore(reinterpret_cast(output.ptr()) + 8, res2); } - - // Store result - wrapper::vstore(reinterpret_cast(output.ptr()), res1); - wrapper::vstore(reinterpret_cast(output.ptr()) + 8, res2); } else { @@ -1912,7 +2000,7 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const } // Store result - wrapper::vstore(reinterpret_cast(output.ptr()), (input_qinfo != output_qinfo) ? vquantize_q8(vdequantize(vres, input_qinfo), output_qinfo) : vres); + wrapper::vstore(reinterpret_cast(output.ptr()), (input_qinfo != output_qinfo) ? 
vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres); } }, input, output); diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp index 07d1f1f702..75c619a635 100644 --- a/tests/validation/NEON/PoolingLayer.cpp +++ b/tests/validation/NEON/PoolingLayer.cpp @@ -55,10 +55,7 @@ const auto PoolingLayerDatasetFPSmall = combine(combine(combine(datasets::Poolin /** Input data sets for asymmetric data type */ -const auto PoolingLayerDatasetQASYMM8 = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(3, 7), Size2D(7, 8) })), - framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })), - framework::dataset::make("ExcludePadding", { true })); -const auto PoolingLayerDatasetQASYMM8Small = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 7) })), +const auto PoolingLayerDatasetQASYMM8Small = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(3, 7) })), framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 2, 1, 1) })), framework::dataset::make("ExcludePadding", { true })); -- cgit v1.2.1