From 7bb56c6337997281df10fa28ad7924c921b920eb Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Wed, 26 Jun 2019 15:17:09 +0100
Subject: COMPMID-2409: Add QSYMM16 support for PixelWiseMultiplication for
 NEON

Change-Id: Idfd3b45857201d5143242f9517d3353150b2c923
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/1422
Reviewed-by: Pablo Marquez
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 arm_compute/core/NEON/NEAsymm.h                    | 23 +-------
 arm_compute/core/NEON/NEAsymm.inl                  | 15 -----
 arm_compute/core/NEON/NEMath.h                     | 22 +++++++
 arm_compute/core/NEON/NEMath.inl                   | 28 +++++++--
 arm_compute/core/NEON/NESymm.h                     | 68 +++++++++++++++++++++-
 .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 26 ++++-----
 6 files changed, 125 insertions(+), 57 deletions(-)

(limited to 'arm_compute/core/NEON')

diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 4c8f797360..981c7b075c 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_NEASYMM_H__
 #define __ARM_COMPUTE_NEASYMM_H__
 
+#include "arm_compute/core/NEON/NEMath.h"
 #include <arm_neon.h>
 
 namespace arm_compute
 {
@@ -34,28 +35,6 @@ using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 2
 using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */
 using qasymm8x16_t  = uint8x16_t;  /**< 8 bit quantized asymmetric vector with 16 elements */
 
-/** Round to the nearest division by a power-of-two using exponent
- *
- * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
- *
- * @param[in] x        Vector of 4 elements
- * @param[in] exponent Integer value used to round to nearest division by a power-of-two
- *
- * @return the nearest division by a power-of-two using exponent
- */
-int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
-
-/** Round to the nearest division by a power-of-two using exponent
- *
- * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
- *
- * @param[in] x        Element to divide.
- * @param[in] exponent Integer value used to round to nearest division by a power-of-two
- *
- * @return the nearest division by a power-of-two using exponent
- */
-int32_t rounding_divide_by_pow2(int32_t x, int exponent);
-
 /** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
  *
  * vd*vs + vo
diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl
index 209785d94e..a98c6aa390 100644
--- a/arm_compute/core/NEON/NEAsymm.inl
+++ b/arm_compute/core/NEON/NEAsymm.inl
@@ -23,21 +23,6 @@
  */
 namespace arm_compute
 {
-inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
-{
-    const int32x4_t shift_vec  = vdupq_n_s32(-exponent);
-    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
-    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
-    return vrshlq_s32(fixed_up_x, shift_vec);
-}
-
-inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
-{
-    const int32_t mask      = (1 << exponent) - 1;
-    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
-    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
-}
-
 inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
 {
     // Convert uint8 vectors to uint16 vectors
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index 46d97f6a0d..59a03c9d11 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -124,6 +124,28 @@ float32x4_t vtanhq_f32(float32x4_t val);
  */
 float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
 
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
+ *
+ * @param[in] x        Vector of 4 elements
+ * @param[in] exponent Integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
+ *
+ * @param[in] x        Element to divide.
+ * @param[in] exponent Integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32_t rounding_divide_by_pow2(int32_t x, int exponent);
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Calculate hyperbolic tangent.
  *
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 172aaef941..2247c14f47 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -69,19 +69,20 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val)
 {
 #ifdef __aarch64__
     return vrndnq_f32(val);
-#else  // __aarch64__
+#else // __aarch64__
     static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
-    static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f);
-    static const int32x4_t CONST_1_INT = vdupq_n_s32(1);
-    const float32x4_t floor_val = vfloorq_f32(val);
-    const float32x4_t diff = vsubq_f32(val, floor_val);
+    static const float32x4_t CONST_1_FLOAT    = vdupq_n_f32(1.f);
+    static const int32x4_t   CONST_1_INT      = vdupq_n_s32(1);
+    const float32x4_t        floor_val        = vfloorq_f32(val);
+    const float32x4_t        diff             = vsubq_f32(val, floor_val);
 
     /*
      * Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0).
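     * (i.e. round half to even: 0.5 -> 0.0, 1.5 -> 2.0, 2.5 -> 2.0, matching vrndnq_f32 on the AArch64 path); otherwise floor_val + 1 is selected.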
     * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT))))
     */
-    return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))), floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+    return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
+                     floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
 #endif // __aarch64__
 }
@@ -191,6 +192,21 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
 }
 #endif /* DOXYGEN_SKIP_THIS */
 
+inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
+{
+    const int32x4_t shift_vec  = vdupq_n_s32(-exponent);
+    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
+    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
+    return vrshlq_s32(fixed_up_x, shift_vec);
+}
+
+inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
+{
+    const int32_t mask      = (1 << exponent) - 1;
+    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
+    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
+}
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Exponent polynomial coefficients */
 /** Logarithm polynomial coefficients */
diff --git a/arm_compute/core/NEON/NESymm.h b/arm_compute/core/NEON/NESymm.h
index 364a317bc7..a60d5d0fde 100644
--- a/arm_compute/core/NEON/NESymm.h
+++ b/arm_compute/core/NEON/NESymm.h
@@ -24,11 +24,17 @@
 #ifndef __ARM_COMPUTE_NESYMM_H__
 #define __ARM_COMPUTE_NESYMM_H__
 
-#include "NEAsymm.h"
+#include "arm_compute/core/NEON/NEMath.h"
 #include <arm_neon.h>
 
 namespace arm_compute
 {
+using qsymm8_t  = int8_t;  /**< 8 bit quantized symmetric scalar value */
+using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */
+
+using qsymm16x8_t   = int16x8_t;   /**< 16 bit quantized symmetric vector with 8 elements */
+using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 16 elements */
+
 /** Performs final quantization step on 8 signed 16-bit elements
  *
  * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
@@ -149,5 +155,65 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
     return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
 }
 
+/** Dequantize a neon vector holding 16 16-bit quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale              = qi.scale;
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Quantize a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
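+ * @note The scale must be non-zero. On AArch64 the scaled values are rounded to nearest
+ *       (ties to even) before saturating to the 16-bit range; on other builds the
+ *       float-to-int conversion truncates towards zero.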
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float scale = qi.scale;
+    ARM_COMPUTE_ERROR_ON(scale == 0.f);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    const qsymm16x8x2_t res =
+    {
+        vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
+        vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
+    };
+
+    return res;
+}
+
 } // namespace arm_compute
 #endif // __ARM_COMPUTE_NESYMM_H__
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index daa29fdf4f..e2ea90a33f 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -56,12 +56,12 @@ public:
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/S16/F16/F32
-     * @param[in]  input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[out] output          Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32
+     * @param[in]  input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[out] output          Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8.
+     * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8 or QSYMM16.
      * @param[in]  rounding_policy Rounding policy.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
@@ -70,12 +70,12 @@ public:
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/S16/F16/F32
-     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[in] output          Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QSYMM16/S16/F16/F32
+     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[in] output          Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8.
+     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8 or QSYMM16.
      * @param[in] rounding_policy Rounding policy.
      *
      * @return a status
      */
@@ -114,12 +114,12 @@ private:
      * @param[in] output_qua_info Quantization Info of tensor output.
      *
      */
-    using MulFunctionQASYMM8 = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
-                                    const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info);
+    using MulFunctionQuantized = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
+                                      const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info);
 
-    MulFunctionFloat   *_func_float;
-    MulFunctionInt     *_func_int;
-    MulFunctionQASYMM8 *_func_qasymm8;
+    MulFunctionFloat     *_func_float;
+    MulFunctionInt       *_func_int;
+    MulFunctionQuantized *_func_quantized;
 
 private:
     const ITensor *_input1;
-- 
cgit v1.2.1
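
[Editor's note] The rounding_divide_by_pow2 helpers that this patch moves into NEMath implement round-to-nearest division by 2^exponent with ties rounded away from zero; the Doxygen @note's expression (x + 2^n - 1) / 2^n describes a ceiling division and does not match the implementation. A standalone scalar walk-through (not part of the commit; the helper name rdbp2 is ours):

    #include <cassert>
    #include <cstdint>

    // Same logic as the scalar rounding_divide_by_pow2 added to NEMath.inl.
    int32_t rdbp2(int32_t x, int exponent)
    {
        const int32_t mask      = (1 << exponent) - 1;            // low bits discarded by the shift
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);  // bias so negatives tie away from zero
        return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
    }

    int main()
    {
        assert(rdbp2(12, 3) == 2);   //  12/8 =  1.5   ->  2 (tie, away from zero)
        assert(rdbp2(-12, 3) == -2); // -12/8 = -1.5   -> -2 (tie, away from zero)
        assert(rdbp2(3, 3) == 0);    //   3/8 =  0.375 ->  0 (a ceiling formula would give 1)
        return 0;
    }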
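[Editor's note] A minimal sketch (not part of the commit) of driving the new QSYMM16 path through NEPixelWiseMultiplication, the runtime wrapper around this kernel. The tensor shape, quantization scale, and scale/policy arguments are illustrative assumptions; quantized inputs need ConvertPolicy::SATURATE, since the updated docs reject WRAP for QASYMM8/QSYMM16:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // 1D tensors of 16 QSYMM16 values; the quantization scale is arbitrary here.
        const TensorInfo info(TensorShape(16U), 1, DataType::QSYMM16, QuantizationInfo(1.f / 32768.f));

        Tensor in1, in2, out;
        in1.allocator()->init(info);
        in2.allocator()->init(info);
        out.allocator()->init(info);

        // scale = 1.f satisfies the "1/255 or 1/2^n" constraint (n = 0).
        NEPixelWiseMultiplication mul;
        mul.configure(&in1, &in2, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();

        // ... fill in1/in2 with quantized data, then execute the multiplication ...
        mul.run();
        return 0;
    }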