diff options
author | Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com> | 2023-02-09 11:52:06 +0000 |
---|---|---|
committer | Mohmun02 <MohammedSuhail.Munshi@arm.com> | 2023-02-15 16:13:08 +0000 |
commit | ecaa10a5594b58f14f5962cdda71d9313d4f3aa8 (patch) | |
tree | 55a0ec0557710438bdece14739f0c145c6789725 | |
parent | 2cc6e0bcdcca5035670f5fec95c3621f619a8acd (diff) | |
download | ComputeLibrary-ecaa10a5594b58f14f5962cdda71d9313d4f3aa8.tar.gz |
Fix Intermittent Neon™ ReduceMean QASYMM8 Mismatch
- Dividing scale by number of elements causes accuracy loss due to limitations in float datatype and truncation to int
- Adds rounding after division on aarch64 to negate this.
Resolves: [COMPMID-5839]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I54ef0f7e56f39da1fa5f30378f551b5ca419a61d
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/492456
Tested-by: bsgcomp <bsgcomp@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9110
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- | src/core/NEON/kernels/NEReductionOperationKernel.cpp | 28 | ||||
-rw-r--r-- | tests/validation/fixtures/ReduceMeanFixture.h | 8 | ||||
-rw-r--r-- | tests/validation/reference/ReductionOperation.cpp | 54 | ||||
-rw-r--r-- | tests/validation/reference/ReductionOperation.h | 3 |
4 files changed, 59 insertions, 34 deletions
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index e0f43ab176..19955af493 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -625,13 +625,13 @@ struct RedOpX_quantized Iterator input(in, in_win_no_pad); Iterator output(out, out_window); - const float in_offset = static_cast<float>(iq_info.offset); + const auto in_offset = static_cast<float>(iq_info.offset); const float in_scale = iq_info.scale; - const float out_offset = static_cast<float>(oq_info.offset); + const auto out_offset = static_cast<float>(oq_info.offset); const float out_scale = oq_info.scale; - const float num_elements = static_cast<float>(in_info.dimension(0)); + const auto num_elements = static_cast<float>(in_info.dimension(0)); const float A = in_scale / (out_scale * num_elements); const float B = out_offset - (in_scale * in_offset) / (out_scale); @@ -1382,10 +1382,17 @@ struct RedOpYZW_quantized vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A); vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A); - vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f); +#ifdef __aarch64__ + vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f); + vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f); + vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f); + vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f); +#else // defined(__aarch64__) + vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f); +#endif // __aarch64__ const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); @@ -1521,7 +1528,12 @@ struct RedOpYZW_quantized { case ReductionOperation::MEAN_SUM: { + // Apply previously calculated coefficients (with rounding on aarch64) +#ifdef __aarch64__ + const int32_t res = arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B); +#else // defined(__aarch64__) const int32_t res = A * (static_cast<float>(res_value_q)) + B; +#endif // __aarch64__ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res); break; } diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h index 304630e9f5..13354eedc1 100644 --- a/tests/validation/fixtures/ReduceMeanFixture.h +++ b/tests/validation/fixtures/ReduceMeanFixture.h @@ -124,7 +124,13 @@ protected: { TensorShape output_shape = i == 0 ? src_shape : out.shape(); output_shape.set(axis[i], 1); - out = reference::reduction_operation<T, T>(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM, quantization_info_output); + bool is_opencl = false; + +#ifdef ARM_COMPUTE_OPENCL_ENABLED + is_opencl = std::is_same<CLTensor, TensorType>::value; // Round down to zero on opencl to match kernel +#endif /* ARM_COMPUTE_OPENCL_ENABLED */ + out = reference::reduction_operation<T, T>(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM, quantization_info_output, is_opencl ? RoundingPolicy::TO_ZERO : RoundingPolicy::TO_NEAREST_UP); + } if(!keep_dims) diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp index ffb79f86c5..e2890afb9f 100644 --- a/tests/validation/reference/ReductionOperation.cpp +++ b/tests/validation/reference/ReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "ReductionOperation.h" - #include "tests/validation/Helpers.h" #include <algorithm> @@ -39,7 +38,7 @@ namespace reference namespace { template <typename T, typename OT> -OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride) +OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride, RoundingPolicy policy) { using type = typename std::remove_cv<OT>::type; T res; @@ -99,7 +98,14 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in } if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0) { - int_res /= reduce_elements; + // Only use rounding in aarch64 to be consistent with kernel +#ifdef __aarch64__ + // Divide in float format, then rounded to nearest and implicitly cast back to int + int_res = round(static_cast<float>(int_res) / static_cast<float>(reduce_elements), policy); +#else // defined(__aarch64__) + ARM_COMPUTE_UNUSED(policy); + int_res /= reduce_elements; // Legacy compatibility +#endif // __aarch64 } res = static_cast<type>(int_res); } @@ -175,7 +181,7 @@ OT reduce_operation_arg_min_max(const T *ptr, int reduce_elements, ReductionOper } // namespace template <typename T, typename OT> -SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) +SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, RoundingPolicy policy) { // Create reference const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); @@ -197,7 +203,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T const T *src_row_ptr = src.data() + du * reduce_elems; dst[du] = is_arg_min_max ? reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, 1) : - reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1); + reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1, policy); } } break; @@ -213,7 +219,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T const T *src_row_ptr = src.data() + in_offset; dst[out_offset] = is_arg_min_max ? reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width) : - reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width); + reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width, policy); } } } @@ -232,7 +238,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T const T *src_row_ptr = src.data() + in_offset; dst[out_offset] = is_arg_min_max ? reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height) : - reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height); + reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height, policy); } } } @@ -254,7 +260,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T const T *src_row_ptr = src.data() + in_offset; dst[out_offset] = is_arg_min_max ? reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth) : - reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); + reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth, policy); } } } @@ -269,21 +275,21 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T } template <typename T, typename OT> -SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output) +SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output, RoundingPolicy policy) { ARM_COMPUTE_UNUSED(quantization_info_output); - return compute_reduction_operation<T, OT>(src, dst_shape, axis, op); + return compute_reduction_operation<T, OT>(src, dst_shape, axis, op, policy); } template <> -SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output) +SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output, RoundingPolicy policy) { if(src.data_type() == DataType::QASYMM8) { // If the operation is MEAN_SUM, we can directly use the uint8 implementation without taking into account scale and offset if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output) { - return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op); + return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op, policy); } else { @@ -294,19 +300,19 @@ SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, cons } else { - return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op); + return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op, policy); } } template <> -SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output) +SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output, RoundingPolicy policy) { if(src.data_type() == DataType::QASYMM8_SIGNED) { // If the operation is MEAN_SUM, we can directly use the int8 implementation without taking into account scale and offset if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output) { - return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op); + return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op, policy); } else { @@ -317,25 +323,25 @@ SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const } else { - return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op); + return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op, policy); } } template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, - QuantizationInfo quantization_info_output = QuantizationInfo()); + QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO); template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, - QuantizationInfo quantization_info_output = QuantizationInfo()); + QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO); template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, - QuantizationInfo quantization_info_output = QuantizationInfo()); + QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO); template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int32_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, - QuantizationInfo quantization_info_output = QuantizationInfo()); + QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO); template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, - QuantizationInfo quantization_info_output = QuantizationInfo()); + QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO); template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, - QuantizationInfo quantization_info_output = QuantizationInfo()); + QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO); template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, - QuantizationInfo quantization_info_output = QuantizationInfo()); + QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO); } // namespace reference } // namespace validation diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h index 9c9e721b29..dd97778834 100644 --- a/tests/validation/reference/ReductionOperation.h +++ b/tests/validation/reference/ReductionOperation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_TEST_REDUCTION_OPERATION_H #include "tests/SimpleTensor.h" +#include "arm_compute/core/Rounding.h" #include "tests/validation/Helpers.h" namespace arm_compute @@ -37,7 +38,7 @@ namespace reference { template <typename T, typename OT> SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, - QuantizationInfo quantization_info_output = QuantizationInfo()); + QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO); } // namespace reference } // namespace validation } // namespace test |