From 19bd412fd044197726dbd8c756dbd74a9e33fd2b Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Wed, 22 Jan 2020 10:27:06 +0000
Subject: COMPMID-3022 Fix NEON QASYMM8 Reduction Operation

Use proper offsets in our tests and subtract the accumulated offsets
when we do a SUM operation. Also change the reference MEAN_SUM
implementation to use floats. As a result, the tolerance is increased
to 2.

Change-Id: Icac8b84680bd880fd30dc1dd82d19cce43921eb5
Signed-off-by: Michalis Spyrou
Reviewed-on: https://review.mlplatform.org/c/2616
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
---
 .../core/NEON/wrapper/intrinsics/intrinsics.h      |   1 +
 arm_compute/core/NEON/wrapper/intrinsics/qmovun.h  |  46 +++++++++
 .../NEON/kernels/NEReductionOperationKernel.cpp    | 103 +++++++++++++--------
 .../NEON/functions/NEReductionOperation.cpp        |   8 +-
 tests/validation/CL/ReduceMean.cpp                 |   4 +-
 tests/validation/NEON/ReduceMean.cpp               |   4 +-
 tests/validation/NEON/ReductionOperation.cpp       |  19 +---
 tests/validation/reference/ReductionOperation.cpp  |   4 +-
 8 files changed, 130 insertions(+), 59 deletions(-)
 create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/qmovun.h

diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
index 3d674757e8..26b4322fa4 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -57,6 +57,7 @@
 #include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/pow.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/qmovun.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/round.h"
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h b/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
new file mode 100644
index 0000000000..a0347020db
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_QMOVUN_H
+#define ARM_COMPUTE_WRAPPER_QMOVUN_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VQMOVUN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vqmovun(const vtype &a)            \
+    {                                               \
+        return prefix##_##postfix(a);               \
+    }
+
+VQMOVUN_IMPL(uint32x2_t, int64x2_t, vqmovun, s64)
+VQMOVUN_IMPL(uint16x4_t, int32x4_t, vqmovun, s32)
+VQMOVUN_IMPL(uint8x8_t, int16x8_t, vqmovun, s16)
+
+#undef VQMOVUN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_QMOVUN_H */
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 5dc4b556d1..da82bc2f6f 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/SaturateCast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
@@ -690,14 +691,19 @@ struct RedOpX_qasymm8
         auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
         carry_paddition      = wrapper::vpadd(carry_paddition, carry_paddition);

-        auto res = wrapper::vgetlane(carry_paddition, 0);
+        auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));

         if(op == ReductionOperation::MEAN_SUM)
         {
             res /= in_info.dimension(0);
         }
+        else
+        {
+            // Subtract accumulated offsets
+            res -= (in_info.dimension(0) - 1) * iq_info.offset;
+        }

-        *(output.ptr()) = static_cast<uint8_t>(res);
+        *(output.ptr()) = utils::cast::saturate_cast<uint8_t>(res);
         }
     }
 }
@@ -878,15 +884,15 @@ struct RedOpYZW_qasymm8
     execute_window_loop(in_slice, [&](const Coordinates &)
     {
         uint32x4x4_t vec_res_idx{ { 0 } };
-        auto vec_res_value1 = vdupq_n_u32(0);
-        auto vec_res_value2 = vdupq_n_u32(0);
-        auto vec_res_value3 = vdupq_n_u32(0);
-        auto vec_res_value4 = vdupq_n_u32(0);
+        auto vec_res_value1 = wrapper::vdup_n(static_cast<uint32_t>(0), wrapper::traits::vector_128_tag{});
+        auto vec_res_value2 = wrapper::vdup_n(static_cast<uint32_t>(0), wrapper::traits::vector_128_tag{});
+        auto vec_res_value3 = wrapper::vdup_n(static_cast<uint32_t>(0), wrapper::traits::vector_128_tag{});
+        auto vec_res_value4 = wrapper::vdup_n(static_cast<uint32_t>(0), wrapper::traits::vector_128_tag{});

-        auto vec_res_value1_f = vdupq_n_f32(1);
-        auto vec_res_value2_f = vdupq_n_f32(1);
-        auto vec_res_value3_f = vdupq_n_f32(1);
-        auto vec_res_value4_f = vdupq_n_f32(1);
+        auto vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+        auto vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+        auto vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+        auto vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});

         auto vec_res_value = wrapper::vloadq(input.ptr());
@@ -915,16 +921,16 @@
             }
             case ReductionOperation::PROD:
             {
-                const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
-                const auto scale32x4f_4  = vdupq_n_f32(iq_info.scale);
+                const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
+                const auto scale32x4f_4  = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});

-                const auto temp16x8t_1 = vmovl_u8(vget_low_u8(vec_elements));
-                const auto temp16x8t_2 = vmovl_u8(vget_high_u8(vec_elements));
+                const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));

-                const auto temp32x4t_1 = vmovl_u16(vget_low_u16(temp16x8t_1));
-                const auto temp32x4t_2 = vmovl_u16(vget_high_u16(temp16x8t_1));
-                const auto temp32x4t_3 = vmovl_u16(vget_low_u16(temp16x8t_2));
-                const auto temp32x4t_4 = vmovl_u16(vget_high_u16(temp16x8t_2));
+                const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));

                 auto temp32x4f_1 = vcvtq_f32_u32(temp32x4t_1);
                 auto temp32x4f_2 = vcvtq_f32_u32(temp32x4t_2);
@@ -932,15 +938,15 @@
                 auto temp32x4f_4 = vcvtq_f32_u32(temp32x4t_4);

                 //de-quantize vec_elements
-                temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
-                temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
-                temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
-                temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
-                vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
-                vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
-                vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
-                vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+                temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+                temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+                temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+                temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+                vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
+                vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
+                vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
+                vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
                 break;
             }
             case ReductionOperation::ARG_IDX_MIN:
@@ -974,7 +980,7 @@

         if(op == ReductionOperation::MEAN_SUM)
         {
-            const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
+            const auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<float>(in_info.dimension(axis)), wrapper::traits::vector_128_tag{}));
             vec_res_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value1), vec_width_inv);
             vec_res_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value2), vec_width_inv);
             vec_res_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value3), vec_width_inv);
@@ -987,14 +993,14 @@
         }
         else if(op == ReductionOperation::PROD)
         {
-            const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
+            const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
             const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));

             //re-quantize
-            vec_res_value1_f = vaddq_f32(vmulq_f32(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
-            vec_res_value2_f = vaddq_f32(vmulq_f32(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
-            vec_res_value3_f = vaddq_f32(vmulq_f32(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
-            vec_res_value4_f = vaddq_f32(vmulq_f32(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
+            vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+            vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+            vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+            vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);

             vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f);
             vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f);
@@ -1015,10 +1021,33 @@
         }
         else
         {
-            const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
-            const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
-            auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-            wrapper::vstore(output.ptr(), res);
+            if(op == ReductionOperation::SUM)
+            {
+                // Subtract offsets
+                auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
+
+                auto vec_res_s_value1 = vreinterpretq_s32_u32(vec_res_value1);
+                auto vec_res_s_value2 = vreinterpretq_s32_u32(vec_res_value2);
+                auto vec_res_s_value3 = vreinterpretq_s32_u32(vec_res_value3);
+                auto vec_res_s_value4 = vreinterpretq_s32_u32(vec_res_value4);
+
+                vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
+                vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
+                vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
+                vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
+
+                const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
+                const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
+                auto res = wrapper::vcombine(wrapper::vqmovun(temp16x8t_1), wrapper::vqmovun(temp16x8t_2));
+                wrapper::vstore(output.ptr(), res);
+            }
+            else
+            {
+                const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+                const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+                auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+                wrapper::vstore(output.ptr(), res);
+            }
         }
     },
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 4e37659d30..43205ef762 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -194,11 +194,15 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i
         }
         case ReductionOperation::ARG_IDX_MAX:
         case ReductionOperation::ARG_IDX_MIN:
+        {
+            pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+            break;
+        }
         case ReductionOperation::MEAN_SUM:
         case ReductionOperation::SUM_SQUARE:
         case ReductionOperation::SUM:
         {
-            pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+            pixelValue = PixelValue(static_cast<uint32_t>(0));
             break;
         }
         default:
diff --git a/tests/validation/CL/ReduceMean.cpp b/tests/validation/CL/ReduceMean.cpp
index 1b7400bf53..036ea181ac 100644
--- a/tests/validation/CL/ReduceMean.cpp
+++ b/tests/validation/CL/ReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -44,7 +44,7 @@ namespace
 {
 constexpr AbsoluteTolerance<float> tolerance_f32(0.001f);  /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
 constexpr AbsoluteTolerance<float> tolerance_f16(0.03f);   /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric quantized type */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(2); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric quantized type */

 const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(0, 1, 2, 3) }),
                                framework::dataset::make("KeepDims", { true }));
diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp
index 6d0caf7160..210c3670d4 100644
--- a/tests/validation/NEON/ReduceMean.cpp
+++ b/tests/validation/NEON/ReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -46,7 +46,7 @@ constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value f
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 constexpr AbsoluteTolerance<float> tolerance_f16(0.03f);   /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric quantized type */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(2); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric quantized type */

 const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(0, 1, 2, 3) }),
                                framework::dataset::make("KeepDims", { true }));
diff --git a/tests/validation/NEON/ReductionOperation.cpp b/tests/validation/NEON/ReductionOperation.cpp
index 3a7f707d23..93f1a80735 100644
--- a/tests/validation/NEON/ReductionOperation.cpp
+++ b/tests/validation/NEON/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -58,9 +58,9 @@ const auto ReductionOperations = framework::dataset::make("ReductionOperation",
 const auto QuantizationInfos = framework::dataset::make("QuantizationInfo",
 {
-    QuantizationInfo(1.f / 128, -10),
-    QuantizationInfo(1.f / 64, -5),
-    QuantizationInfo(1.f / 32, -2)
+    QuantizationInfo(1.f / 128, 10),
+    QuantizationInfo(1.f / 64, 5),
+    QuantizationInfo(1.f / 32, 2)
 });

 const auto Axises = framework::dataset::make("Axis",
@@ -127,7 +127,7 @@ template <typename T>
 using NEReductionOperationQuantizedFixture = ReductionOperationQuantizedFixture<Tensor, Accessor, NEReductionOperation, T>;

 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), Axises),
                                                        ReductionOperations),
                                                QuantizationInfos),
                                KeepDims))
@@ -136,15 +136,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationQuantizedFixture<uint8_t>,
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEReductionOperationQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), Axises),
-                                                       ReductionOperations),
-                                               QuantizationInfos),
-                               KeepDims))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
-}

 TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // ReductionOperation
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index 330a3b82ec..9c2c8eeb94 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -277,7 +277,7 @@ SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorSha
 template <>
 SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op)
 {
-    if(src.data_type() == DataType::QASYMM8 && op != ReductionOperation::MEAN_SUM)
+    if(src.data_type() == DataType::QASYMM8)
     {
         SimpleTensor<float> src_f = convert_from_asymmetric(src);
         SimpleTensor<float> dst_f = reference::reduction_operation<float>(src_f, dst_shape, axis, op);
-- 
cgit v1.2.1
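
Background on the arithmetic behind the kernel change: QASYMM8 dequantizes as real = scale * (q - offset), so summing n elements gives scale * (sum(q) - n * offset), and re-quantizing that sum leaves q_sum = sum(q) - (n - 1) * offset. That is why the kernel subtracts (dimension - 1) * offset from the raw accumulator. The following scalar sketch models this; the helper name and use of the standard library are illustrative only, not part of the patch.

    // Scalar model of the QASYMM8 SUM fix; hypothetical helper, not library code.
    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // real = scale * (q - offset)  =>  sum(real) = scale * (sum(q) - n * offset)
    // Re-quantizing: q_sum = sum(q) - n * offset + offset = sum(q) - (n - 1) * offset
    uint8_t quantized_sum(const std::vector<uint8_t> &q, int32_t offset)
    {
        int32_t acc = std::accumulate(q.begin(), q.end(), int32_t{ 0 });
        acc -= (static_cast<int32_t>(q.size()) - 1) * offset;
        // Clamp to [0, 255], mirroring utils::cast::saturate_cast<uint8_t> in the kernel
        return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, acc)));
    }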
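The new wrapper::vqmovun overloads exist because, after the offset subtraction, the accumulator lanes are signed and may be negative, so the narrowing step must saturate into the unsigned QASYMM8 range rather than reinterpret. A minimal standalone demonstration of that behaviour (assumes a NEON-capable target; not part of the patch):

    #include <arm_neon.h>
    #include <cstdio>

    int main()
    {
        // Signed accumulator lanes after offset subtraction: some negative, one large.
        const int32_t   lanes[4] = { -3, 0, 200, 70000 };
        const int32x4_t acc      = vld1q_s32(lanes);

        // s32 -> s16 saturating narrow (70000 saturates to 32767)...
        const int16x4_t n16 = vqmovn_s32(acc);
        const int16x8_t w16 = vcombine_s16(n16, n16);

        // ...then s16 -> u8 signed-to-unsigned saturating narrow: negative lanes
        // clamp to 0 and values above 255 clamp to 255, matching QASYMM8 semantics.
        const uint8x8_t out = vqmovun_s16(w16);

        printf("%u %u %u %u\n",
               (unsigned)vget_lane_u8(out, 0), (unsigned)vget_lane_u8(out, 1),
               (unsigned)vget_lane_u8(out, 2), (unsigned)vget_lane_u8(out, 3));
        // prints: 0 0 200 255
        return 0;
    }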