diff options
-rw-r--r-- | arm_compute/core/NEON/NEAsymm.h | 23 | ||||
-rw-r--r-- | arm_compute/core/NEON/NEAsymm.inl | 15 | ||||
-rw-r--r-- | arm_compute/core/NEON/NEMath.h | 22 | ||||
-rw-r--r-- | arm_compute/core/NEON/NEMath.inl | 28 | ||||
-rw-r--r-- | arm_compute/core/NEON/NESymm.h | 68 | ||||
-rw-r--r-- | arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h | 26 | ||||
-rw-r--r-- | arm_compute/core/QuantizationInfo.h | 1 | ||||
-rw-r--r-- | arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h | 16 | ||||
-rw-r--r-- | src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp | 76 | ||||
-rw-r--r-- | tests/validation/NEON/PixelWiseMultiplication.cpp | 175 |
10 files changed, 340 insertions, 110 deletions
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h index 4c8f797360..981c7b075c 100644 --- a/arm_compute/core/NEON/NEAsymm.h +++ b/arm_compute/core/NEON/NEAsymm.h @@ -24,6 +24,7 @@ #ifndef __ARM_COMPUTE_NEASYMM_H__ #define __ARM_COMPUTE_NEASYMM_H__ +#include "arm_compute/core/NEON/NEMath.h" #include <arm_neon.h> namespace arm_compute @@ -34,28 +35,6 @@ using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 2 using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */ using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */ -/** Round to the nearest division by a power-of-two using exponent - * - * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent - * - * @param[in] x Vector of 4 elements - * @param[in] exponent Integer value used to round to nearest division by a power-of-two - * - * @return the nearest division by a power-of-two using exponent - */ -int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent); - -/** Round to the nearest division by a power-of-two using exponent - * - * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent - * - * @param[in] x Element to divide. - * @param[in] exponent Integer value used to round to nearest division by a power-of-two - * - * @return the nearest division by a power-of-two using exponent - */ -int32_t rounding_divide_by_pow2(int32_t x, int exponent); - /** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector * * vd*vs + vo diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl index 209785d94e..a98c6aa390 100644 --- a/arm_compute/core/NEON/NEAsymm.inl +++ b/arm_compute/core/NEON/NEAsymm.inl @@ -23,21 +23,6 @@ */ namespace arm_compute { -inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent) -{ - const int32x4_t shift_vec = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); - const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed_up_x, shift_vec); -} - -inline int32_t rounding_divide_by_pow2(int32_t x, int exponent) -{ - const int32_t mask = (1 << exponent) - 1; - const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0); - return (x >> exponent) + ((x & mask) > threshold ? 1 : 0); -} - inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo) { // Convert uint8 vectors to uint16 vectors diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h index 46d97f6a0d..59a03c9d11 100644 --- a/arm_compute/core/NEON/NEMath.h +++ b/arm_compute/core/NEON/NEMath.h @@ -124,6 +124,28 @@ float32x4_t vtanhq_f32(float32x4_t val); */ float32x4_t vpowq_f32(float32x4_t val, float32x4_t n); +/** Round to the nearest division by a power-of-two using exponent + * + * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent + * + * @param[in] x Vector of 4 elements + * @param[in] exponent Integer value used to round to nearest division by a power-of-two + * + * @return the nearest division by a power-of-two using exponent + */ +int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent); + +/** Round to the nearest division by a power-of-two using exponent + * + * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent + * + * @param[in] x Element to divide. + * @param[in] exponent Integer value used to round to nearest division by a power-of-two + * + * @return the nearest division by a power-of-two using exponent + */ +int32_t rounding_divide_by_pow2(int32_t x, int exponent); + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Calculate hyperbolic tangent. * diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index 172aaef941..2247c14f47 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -69,19 +69,20 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val) { #ifdef __aarch64__ return vrndnq_f32(val); -#else // __aarch64__ +#else // __aarch64__ static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f); - static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f); - static const int32x4_t CONST_1_INT = vdupq_n_s32(1); - const float32x4_t floor_val = vfloorq_f32(val); - const float32x4_t diff = vsubq_f32(val, floor_val); + static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f); + static const int32x4_t CONST_1_INT = vdupq_n_s32(1); + const float32x4_t floor_val = vfloorq_f32(val); + const float32x4_t diff = vsubq_f32(val, floor_val); /* * Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0). * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))) */ - return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))), floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); + return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))), + floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); #endif // __aarch64__ } @@ -191,6 +192,21 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n) } #endif /* DOXYGEN_SKIP_THIS */ +inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent) +{ + const int32x4_t shift_vec = vdupq_n_s32(-exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); + const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed_up_x, shift_vec); +} + +inline int32_t rounding_divide_by_pow2(int32_t x, int exponent) +{ + const int32_t mask = (1 << exponent) - 1; + const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0); + return (x >> exponent) + ((x & mask) > threshold ? 1 : 0); +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Exponent polynomial coefficients */ /** Logarithm polynomial coefficients */ diff --git a/arm_compute/core/NEON/NESymm.h b/arm_compute/core/NEON/NESymm.h index 364a317bc7..a60d5d0fde 100644 --- a/arm_compute/core/NEON/NESymm.h +++ b/arm_compute/core/NEON/NESymm.h @@ -24,11 +24,17 @@ #ifndef __ARM_COMPUTE_NESYMM_H__ #define __ARM_COMPUTE_NESYMM_H__ -#include "NEAsymm.h" +#include "arm_compute/core/NEON/NEMath.h" #include <arm_neon.h> namespace arm_compute { +using qsymm8_t = int8_t; /**< 8 bit quantized symmetric scalar value */ +using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */ + +using qsymm16x8_t = int16x8_t; /**< 16 bit quantized symmetric vector with 8 elements */ +using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 16 elements */ + /** Performs final quantization step on 8 signed 16-bit elements * * @tparam is_bounded_relu Specified if a fused bounded relu should be applied @@ -149,5 +155,65 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale) return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); } +/** Dequantize a neon vector holding 16 16-bit quantized values. + * + * @param[in] qv Input values to be dequantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale), + } + }; + return vdequantized_input; +} + +/** Quantize a neon vector holding 16 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return A neon vector holding the quantized values + */ +inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + ARM_COMPUTE_ERROR_ON(scale == 0.f); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), +#endif //__aarch64__ + } + }; + const qsymm16x8x2_t res = + { + vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])), + vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])), + }; + + return res; +} + } // namespace arm_compute #endif // __ARM_COMPUTE_NESYMM_H__ diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h index daa29fdf4f..e2ea90a33f 100644 --- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h @@ -56,12 +56,12 @@ public: * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8. + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8 or QSYMM16. * @param[in] rounding_policy Rounding policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); @@ -70,12 +70,12 @@ public: * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QSYMM16/S16/F16/F32 + * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8. + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8 or QSYMM16. * @param[in] rounding_policy Rounding policy. * * @return a status @@ -114,12 +114,12 @@ private: * @param[in] output_qua_info Quantization Info of tensor output. * */ - using MulFunctionQASYMM8 = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale, - const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info); + using MulFunctionQuantized = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale, + const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info); - MulFunctionFloat *_func_float; - MulFunctionInt *_func_int; - MulFunctionQASYMM8 *_func_qasymm8; + MulFunctionFloat *_func_float; + MulFunctionInt *_func_int; + MulFunctionQuantized *_func_quantized; private: const ITensor *_input1; diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h index 1c49cd29ed..587a380d63 100644 --- a/arm_compute/core/QuantizationInfo.h +++ b/arm_compute/core/QuantizationInfo.h @@ -33,7 +33,6 @@ namespace arm_compute { using qasymm8_t = uint8_t; /**< 8 bit quantized asymmetric scalar value */ -using qsymm8_t = int8_t; /**< 8 bit quantized symmetric scalar value */ using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */ /** Quantization info when assuming per layer quantization */ diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index 53c27c47bf..41137c0135 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -40,14 +40,14 @@ public: * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/S16/F16/F32 + * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32 * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8. + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8 or QSYMM16. * @param[in] rounding_policy Rounding policy. */ void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); @@ -56,12 +56,12 @@ public: * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32 + * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8. + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8 or QSYMM16. * @param[in] rounding_policy Rounding policy. * * @return a status diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp index c313b23ad3..6aaac818e9 100644 --- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEAsymm.h" #include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/NESymm.h" #include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" @@ -63,21 +64,30 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QASYMM8 && input2->data_type() != DataType::QASYMM8, - "Input2 must be QASYMM8 if both input1 is QASYMM8"); + "Input2 must be QASYMM8 if input1 is QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QASYMM8 && input2->data_type() == DataType::QASYMM8 && overflow_policy == ConvertPolicy::WRAP, - "ConvertPolicy cannot be WRAP if datatype is QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() != DataType::QASYMM8 && input2->data_type() == DataType::QASYMM8, + "Input1 must be QASYMM8 if input2 is QASYMM8"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QSYMM16 && input2->data_type() != DataType::QSYMM16, + "Input2 must be QSYMM16 if input1 is QSYMM16"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() != DataType::QSYMM16 && input2->data_type() == DataType::QSYMM16, + "Input1 must be QSYMM16 if input2 is QSYMM16"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input1->data_type()) && overflow_policy == ConvertPolicy::WRAP, + "ConvertPolicy cannot be WRAP if datatype is quantized"); if(output->total_size() > 0) { - if(output->data_type() == DataType::QASYMM8) + if(is_data_type_quantized(output->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); } @@ -128,6 +138,14 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu { set_format_if_unknown(*output, Format::F16); } + else if(input1->data_type() == DataType::QASYMM8) + { + set_data_type_if_unknown(*output, DataType::QASYMM8); + } + else if(input1->data_type() == DataType::QSYMM16) + { + set_data_type_if_unknown(*output, DataType::QSYMM16); + } } // Configure kernel window @@ -201,6 +219,34 @@ void mul_saturate_QASYMM8_QASYMM8_QASYMM8_n(const void *__restrict input1_ptr, c vst1q_u8(output, result); } +void mul_saturate_QSYMM16_QSYMM16_QSYMM16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale, + const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info) +{ + const auto input1 = static_cast<const qsymm16_t *__restrict>(input1_ptr); + const auto input2 = static_cast<const qsymm16_t *__restrict>(input2_ptr); + const auto output = static_cast<qsymm16_t *__restrict>(output_ptr); + + const qsymm16x8x2_t input1_q = vld2q_s16(input1); + const qsymm16x8x2_t input2_q = vld2q_s16(input2); + + // Dequantitize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + + const float32x4x4_t out_f32x4x4 = + { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); + vst2q_s16(output, result); +} + template <bool is_scale255, bool is_sat> void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n) { @@ -488,7 +534,7 @@ void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict } // namespace NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel() - : _func_float(nullptr), _func_int(nullptr), _func_qasymm8(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 } + : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 } { } @@ -508,7 +554,7 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe _output = output; _scale = scale; _scale_exponent = 0; - _func_qasymm8 = nullptr; + _func_quantized = nullptr; _func_int = nullptr; _func_float = nullptr; @@ -536,7 +582,11 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe if(dt_input1 == DataType::QASYMM8 && dt_input2 == DataType::QASYMM8) { - _func_qasymm8 = &mul_saturate_QASYMM8_QASYMM8_QASYMM8_n; + _func_quantized = &mul_saturate_QASYMM8_QASYMM8_QASYMM8_n; + } + else if(dt_input1 == DataType::QSYMM16 && dt_input2 == DataType::QSYMM16) + { + _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16_n; } else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output) { @@ -655,12 +705,12 @@ void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo Iterator input2(_input2, slice_input2); Iterator output(_output, slice); - if(_func_qasymm8 != nullptr) + if(is_data_type_quantized(_input1->info()->data_type())) { execute_window_loop(collapsed, [&](const Coordinates &) { - (*_func_qasymm8)(input1.ptr(), input2.ptr(), output.ptr(), _scale, - _input1->info()->quantization_info().uniform(), _input2->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform()); + (*_func_quantized)(input1.ptr(), input2.ptr(), output.ptr(), _scale, + _input1->info()->quantization_info().uniform(), _input2->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform()); collapsed.slide_window_slice_3D(slice_input1); collapsed.slide_window_slice_3D(slice_input2); }, diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp index 0cc97a2c26..70b88ae9c8 100644 --- a/tests/validation/NEON/PixelWiseMultiplication.cpp +++ b/tests/validation/NEON/PixelWiseMultiplication.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/core/Rounding.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "tests/NEON/Accessor.h" #include "tests/PaddingCalculator.h" @@ -42,8 +43,28 @@ const float scale_unity = 1.f; const float scale_255 = 1.f / 255.f; const float scale_other = 1.f / 32768.f; +constexpr AbsoluteTolerance<float> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit quantized asymmetric data types */ +constexpr AbsoluteTolerance<float> tolerance_qsymm16(1); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit quantized symmetric data types */ + +const auto PixelWiseMultiplicationQSYMM16QuantDataset = combine(combine( + framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0) }), + framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })), + framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })); + +const auto PixelWiseMultiplicationQASYMM8QuantDataset = combine(combine( + framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 32768.f, 0) }), + framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })), + framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 32768.f, 0) })); + +const auto PixelWiseMultiplicationPolicySTNUDataset = combine( + framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE }), + framework::dataset::make("RoundingPolicy", { RoundingPolicy::TO_NEAREST_UP })); + +const auto PixelWiseMultiplicationPolicySTZDataset = combine( + framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE }), + framework::dataset::make("RoundingPolicy", { RoundingPolicy::TO_ZERO })); + #define DEFAULT_VALIDATE validate(Accessor(_target), _reference); -#define QASYMM8_VALIDATE validate(Accessor(_target), _reference, AbsoluteTolerance<uint8_t>(1), 0.f); #define VALIDATE(TYPE, TOLERANCE) validate(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f); #define WRAP_VALIDATE(TYPE, TOLERANCE) validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f); @@ -76,23 +97,6 @@ const float scale_other = 1.f / 32768.f; VALIDATE \ } -#define PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, SCALE, RP, VALIDATE) \ - FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE, \ - combine(combine(combine(combine(combine(combine(combine(combine( \ - datasets::SHAPES, \ - framework::dataset::make("DataType1", DataType::DT1)), \ - framework::dataset::make("DataType2", DataType::DT2)), \ - framework::dataset::make("Scale", std::move(SCALE))), \ - framework::dataset::make("ConvertPolicy", ConvertPolicy::SATURATE)), \ - framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)), \ - framework::dataset::make("QuantizationInfoIn1", QuantizationInfo(1.0 , 0))), \ - framework::dataset::make("QuantizationInfoIn2", QuantizationInfo(1.0 , 0))), \ - framework::dataset::make("QuantizationInfoOut", QuantizationInfo(100.0, 10)))) \ - { \ - VALIDATE \ - } - - // *INDENT-ON* // clang-format on @@ -124,7 +128,8 @@ void validate_configuration(TensorShape shape, DataType dt1, DataType dt2, float } } // namespace -using NEPixelWiseMultiplicationToQASYMM8Fixture = PixelWiseMultiplicationQuatizedValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, uint8_t, uint8_t>; +using NEPixelWiseMultiplicationQASYMM8Fixture = PixelWiseMultiplicationValidationQuantizedFixture<Tensor, Accessor, NEPixelWiseMultiplication, uint8_t, uint8_t>; +using NEPixelWiseMultiplicationQSYMM16Fixture = PixelWiseMultiplicationValidationQuantizedFixture<Tensor, Accessor, NEPixelWiseMultiplication, int16_t, int16_t>; template <typename T> using NEPixelWiseMultiplicationToU8Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, uint8_t>; template <typename T> @@ -179,24 +184,132 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( // clang-format on // *INDENT-ON* -TEST_SUITE(QASYMM8toQASYMM8) - +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8) TEST_SUITE(Scale255) -PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunSmall, ToQASYMM8Fixture, PRECOMMIT, SmallShapes(), QASYMM8, QASYMM8, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(uint8_t, 1)) -PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunLarge, ToQASYMM8Fixture, NIGHTLY, LargeShapes(), QASYMM8, QASYMM8, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(uint8_t, 1)) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("Scale", { scale_255 })), + PixelWiseMultiplicationPolicySTNUDataset), + PixelWiseMultiplicationQASYMM8QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("Scale", { scale_255 })), + PixelWiseMultiplicationPolicySTNUDataset), + PixelWiseMultiplicationQASYMM8QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} TEST_SUITE_END() // Scale255 - TEST_SUITE(ScaleUnity) -PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunSmall, ToQASYMM8Fixture, PRECOMMIT, SmallShapes(), QASYMM8, QASYMM8, scale_unity, TO_ZERO, QASYMM8_VALIDATE) -PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunLarge, ToQASYMM8Fixture, NIGHTLY, LargeShapes(), QASYMM8, QASYMM8, scale_unity, TO_ZERO, QASYMM8_VALIDATE) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("Scale", { scale_unity })), + PixelWiseMultiplicationPolicySTZDataset), + PixelWiseMultiplicationQASYMM8QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("Scale", { scale_unity })), + PixelWiseMultiplicationPolicySTZDataset), + PixelWiseMultiplicationQASYMM8QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // ScaleUnity +TEST_SUITE(ScaleOther) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("Scale", { scale_other })), + PixelWiseMultiplicationPolicySTZDataset), + PixelWiseMultiplicationQASYMM8QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("Scale", { scale_other })), + PixelWiseMultiplicationPolicySTZDataset), + PixelWiseMultiplicationQASYMM8QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // ScaleOther +TEST_SUITE_END() // QASYMM8 +TEST_SUITE(QSYMM16) +TEST_SUITE(Scale255) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), + framework::dataset::make("DataType", DataType::QSYMM16)), + framework::dataset::make("Scale", { scale_255 })), + PixelWiseMultiplicationPolicySTNUDataset), + PixelWiseMultiplicationQSYMM16QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), + framework::dataset::make("DataType", DataType::QSYMM16)), + framework::dataset::make("Scale", { scale_255 })), + PixelWiseMultiplicationPolicySTNUDataset), + PixelWiseMultiplicationQSYMM16QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // Scale255 +TEST_SUITE(ScaleUnity) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), + framework::dataset::make("DataType", DataType::QSYMM16)), + framework::dataset::make("Scale", { scale_unity })), + PixelWiseMultiplicationPolicySTZDataset), + PixelWiseMultiplicationQSYMM16QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), + framework::dataset::make("DataType", DataType::QSYMM16)), + framework::dataset::make("Scale", { scale_unity })), + PixelWiseMultiplicationPolicySTZDataset), + PixelWiseMultiplicationQSYMM16QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} TEST_SUITE_END() // ScaleUnity - TEST_SUITE(ScaleOther) -PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunSmall, ToQASYMM8Fixture, PRECOMMIT, SmallShapes(), QASYMM8, QASYMM8, scale_other, TO_ZERO, QASYMM8_VALIDATE) -PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunLarge, ToQASYMM8Fixture, NIGHTLY, LargeShapes(), QASYMM8, QASYMM8, scale_other, TO_ZERO, QASYMM8_VALIDATE) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), + framework::dataset::make("DataType", DataType::QSYMM16)), + framework::dataset::make("Scale", { scale_other })), + PixelWiseMultiplicationPolicySTZDataset), + PixelWiseMultiplicationQSYMM16QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), + framework::dataset::make("DataType", DataType::QSYMM16)), + framework::dataset::make("Scale", { scale_other })), + PixelWiseMultiplicationPolicySTZDataset), + PixelWiseMultiplicationQSYMM16QuantDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} TEST_SUITE_END() // ScaleOther - -TEST_SUITE_END() // QASYMM8toQASYMM8 +TEST_SUITE_END() // QSYMM16 +TEST_SUITE_END() // Quantized TEST_SUITE(U8toU8) |