From 1b80b6c7255e41257fed3b4dd0fa018e2eeee4c2 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Mon, 17 Jul 2017 15:06:34 +0100 Subject: COMPMID-447: Support scaling factors different than 1 for QS8/QS16 NEPixelWiseMultiplication. Change-Id: I6d90a18df861d53546bdca982192b4ffc0dbb3c2 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80794 Tested-by: Kaizen Reviewed-by: Pablo Tello --- .../kernels/NEPixelWiseMultiplicationKernel.cpp | 108 ++++++++++++++------- tests/validation/NEON/PixelWiseMultiplication.cpp | 90 +++++++++++++++++ tests/validation/TensorOperations.h | 20 ++-- 3 files changed, 176 insertions(+), 42 deletions(-) diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp index 150db39695..33663eb57d 100644 --- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -131,60 +131,100 @@ void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict in template void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position) { - // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0. - ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 8-bit fixed-point pixel-wise multiplication"); - ARM_COMPUTE_UNUSED(n); - - const auto input1 = static_cast(input1_ptr); - const auto input2 = static_cast(input2_ptr); const auto output = static_cast(output_ptr); - const qint8x16_t ta1 = vld1q_qs8(input1); - const qint8x16_t ta2 = vld1q_qs8(input2); + const qint8x16_t ta1 = vld1q_qs8(static_cast(input1_ptr)); + const qint8x16_t ta2 = vld1q_qs8(static_cast(input2_ptr)); + + if(is_scale255) + { + qint16x8_t tmp1_high = vmovl_s8(vget_high_s8(ta1)); + qint16x8_t tmp1_low = vmovl_s8(vget_low_s8(ta1)); + const qint16x8_t tmp2_high = vmovl_s8(vget_high_s8(ta2)); + const qint16x8_t tmp2_low = vmovl_s8(vget_low_s8(ta2)); + + const float32x4x2_t scale255_f32 = + { + { + scale255_constant_f32q, + scale255_constant_f32q + } + }; + const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position); - qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position); + tmp1_high = vmulq_qs16(tmp1_high, tmp2_high, fixed_point_position); + tmp1_low = vmulq_qs16(tmp1_low, tmp2_low, fixed_point_position); + tmp1_high = vmulq_qs16(tmp1_high, scale255, fixed_point_position); + tmp1_low = vmulq_qs16(tmp1_low, scale255, fixed_point_position); - vst1q_s8(output, res); + if(is_sat) + { + vst1q_qs8(output, vcombine_s8(vqmovn_s16(tmp1_low), vqmovn_s16(tmp1_high))); + } + else + { + vst1q_qs8(output, vcombine_s8(vmovn_s16(tmp1_low), vmovn_s16(tmp1_high))); + } + } + else + { + const qint8x16_t vn = vdupq_n_s8(-n); + qint8x16_t res = ta2; + + if(is_sat) + { + res = vqshlq_s8(vqmulq_qs8(ta1, res, fixed_point_position), vn); + } + else + { + res = vshlq_s8(vmulq_qs8(ta1, res, fixed_point_position), vn); + } + vst1q_qs8(output, res); + } } template void mul_QS16_QS16_QS16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position) { - // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0. - ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 16-bit fixed-point pixel-wise multiplication"); - ARM_COMPUTE_UNUSED(n); - const qint16x8x2_t ta1 = vld2q_qs16(static_cast(input1_ptr)); - const qint16x8x2_t ta2 = vld2q_qs16(static_cast(input2_ptr)); + qint16x8x2_t res = vld2q_qs16(static_cast(input2_ptr)); - if(is_sat) + if(is_scale255) { - const qint16x8x2_t res = + const float32x4x2_t scale255_f32 = { { - // First 8 elements - vqmulq_qs16(ta1.val[0], ta2.val[0], fixed_point_position), - // Second 8 elements - vqmulq_qs16(ta1.val[1], ta2.val[1], fixed_point_position) + scale255_constant_f32q, + scale255_constant_f32q } }; - - vst2q_s16(static_cast(output_ptr), res); + const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position); + if(is_sat) + { + res.val[0] = vqmulq_qs16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position); + res.val[1] = vqmulq_qs16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position); + } + else + { + res.val[0] = vmulq_qs16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position); + res.val[1] = vmulq_qs16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position); + } } else { - const qint16x8x2_t res = + const qint16x8_t vn = vdupq_n_s16(-n); + if(is_sat) { - { - // First 8 elements - vmulq_qs16(ta1.val[0], ta2.val[0], fixed_point_position), - // Second 8 elements - vmulq_qs16(ta1.val[1], ta2.val[1], fixed_point_position) - } - }; - - vst2q_s16(static_cast(output_ptr), res); + res.val[0] = vqshlq_s16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn); + res.val[1] = vqshlq_s16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn); + } + else + { + res.val[0] = vshlq_s16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn); + res.val[1] = vshlq_s16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn); + } } + vst2q_s16(static_cast(output_ptr), res); } template @@ -438,6 +478,8 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe { // Check that all data types are the same and all fixed-point positions are the same ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output); + // Check if scale is representable in fixed-point with the provided settings + ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1); } _input1 = input1; diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp index c46ad6a9d7..f80944821c 100644 --- a/tests/validation/NEON/PixelWiseMultiplication.cpp +++ b/tests/validation/NEON/PixelWiseMultiplication.cpp @@ -442,6 +442,36 @@ BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * DataType::QS8 *ConvertPolicies() validate(Accessor(dst), ref_dst); } +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmallScale255, SmallShapes() * DataType::QS8 * (1.f / 255.f) * ConvertPolicies() * RoundingPolicy::TO_NEAREST_UP * boost::unit_test::data::xrange(1, 7), + shape, dt, scale, convert_policy, rounding_policy, fixed_point_position) +{ + // Compute function + Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy); + + // Validate output + validate(Accessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmallScaleOther, SmallShapes() * DataType::QS8 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 7), + shape, dt, convert_policy, rounding_policy, fixed_point_position) +{ + const float scale = 1.f / static_cast(1 << fixed_point_position); + + // Compute function + Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy); + + // Validate output + validate(Accessor(dst), ref_dst, 1.f); +} + BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * DataType::QS8 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 7), shape, dt, convert_policy, rounding_policy, fixed_point_position) @@ -455,6 +485,36 @@ BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * DataType::QS8 *ConvertPolicies() // Validate output validate(Accessor(dst), ref_dst); } + +BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) +BOOST_DATA_TEST_CASE(RunLargeScale255, LargeShapes() * DataType::QS8 * (1.f / 255.f) * ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 7), + shape, dt, scale, convert_policy, rounding_policy, fixed_point_position) +{ + // Compute function + Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy); + + // Validate output + validate(Accessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) +BOOST_DATA_TEST_CASE(RunLargeScaleOther, LargeShapes() * DataType::QS8 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 7), + shape, dt, convert_policy, rounding_policy, fixed_point_position) +{ + const float scale = 1.f / static_cast(1 << fixed_point_position); + + // Compute function + Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy); + + // Validate output + validate(Accessor(dst), ref_dst, 1.f); +} BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE(QS16) @@ -472,6 +532,36 @@ BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * DataType::QS16 *ConvertPolicies() validate(Accessor(dst), ref_dst); } +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmallScale255, SmallShapes() * DataType::QS16 * (1.f / 255.f) * ConvertPolicies() * RoundingPolicy::TO_NEAREST_UP * boost::unit_test::data::xrange(1, 15), + shape, dt, scale, convert_policy, rounding_policy, fixed_point_position) +{ + // Compute function + Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy); + + // Validate output + validate(Accessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmallScaleOther, SmallShapes() * DataType::QS16 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 15), + shape, dt, convert_policy, rounding_policy, fixed_point_position) +{ + const float scale = 1.f / static_cast(1 << fixed_point_position); + + // Compute function + Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy); + + // Validate output + validate(Accessor(dst), ref_dst, 1.f); +} + BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * DataType::QS16 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 15), shape, dt, convert_policy, rounding_policy, fixed_point_position) diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h index 4d067ac748..319047816c 100644 --- a/tests/validation/TensorOperations.h +++ b/tests/validation/TensorOperations.h @@ -866,7 +866,7 @@ void pixel_wise_multiplication(const Tensor &in1, const Tensor &in2, Ten // Fixed-point Pixel-wise Multiplication template ::value>::type> -void fixed_point_pixel_wise_multiplication(const Tensor &in1, const Tensor &in2, Tensor &out, int scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy) +void fixed_point_pixel_wise_multiplication(const Tensor &in1, const Tensor &in2, Tensor &out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy) { using namespace fixed_point_arithmetic; @@ -881,18 +881,20 @@ void fixed_point_pixel_wise_multiplication(const Tensor &in1, const Tensor ARM_COMPUTE_ERROR_ON((in1.data_type() == DataType::QS8) && (fixed_point_position == 0 || fixed_point_position > 7)); ARM_COMPUTE_ERROR_ON((in1.data_type() == DataType::QS16) && (fixed_point_position == 0 || fixed_point_position > 15)); - fixed_point fp_scale(scale, fixed_point_position); - const bool is_sat = convert_policy == ConvertPolicy::SATURATE; - const bool do_scaling = scale != 1; + const fixed_point fp_scale(scale, fixed_point_position); + const bool is_sat = convert_policy == ConvertPolicy::SATURATE; for(int i = 0; i < in1.num_elements(); ++i) { - fixed_point val1(in1[i], fixed_point_position, true); - fixed_point val2(in2[i], fixed_point_position, true); - fixed_point res = (is_sat) ? val1 * val2 : mul(val1, val2); - if(do_scaling) + const fixed_point val1(in1[i], fixed_point_position, true); + fixed_point res(in2[i], fixed_point_position, true); + if(is_sat) { - res = (is_sat) ? res * fp_scale : mul(res, fp_scale); + res = mul(mul(res, val1), fp_scale); + } + else + { + res = mul(mul(res, val1), fp_scale); } out[i] = res.raw(); } -- cgit v1.2.1