From 8d4d1b85bc57d5f76f3939bb422e44df68dc2342 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Thu, 28 Nov 2019 11:31:23 +0000
Subject: COMPMID-2796: Add support for QASYMM8_SIGNED in NEActivationLayer and NEPReluLayer

Change-Id: I089fd19a6beab7779d690bc9ace327f661c2753d
Signed-off-by: Michalis Spyrou
Reviewed-on: https://review.mlplatform.org/c/2407
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
---
 arm_compute/core/NEON/NEAsymm.h                  | 130 +++++++++++++++++++++
 arm_compute/core/NEON/NEAsymm.inl                |  33 ++++++
 .../core/NEON/kernels/NEActivationLayerKernel.h  |  10 +-
 3 files changed, 171 insertions(+), 2 deletions(-)

(limited to 'arm_compute/core/NEON')

diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 53a3ea773f..234d48882c 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -35,6 +35,12 @@ using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 2
 using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */
 using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */
 
+using qasymm8x8_signed_t = int8x8_t; /**< 8 bit quantized signed asymmetric vector with 8 elements */
+using qasymm8x8x2_signed_t = int8x8x2_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */
+using qasymm8x8x3_signed_t = int8x8x3_t; /**< 8 bit quantized signed asymmetric vector with 24 elements */
+using qasymm8x8x4_signed_t = int8x8x4_t; /**< 8 bit quantized signed asymmetric vector with 32 elements */
+using qasymm8x16_signed_t = int8x16_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */
+
 /** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
  *
  * vd*vs + vo
@@ -47,6 +53,18 @@ using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 1
  */
 uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
 
+/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
+ *
+ * vd*vs + vo
+ *
+ * @param[in] vd Input vector value in QASYMM8_SIGNED format
+ * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
+ * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
+ *
+ * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
+ */
+int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
+
 /** Performs final quantization step on 16 elements
  *
  * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
@@ -336,6 +354,29 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI
     return vdequantized_input;
 }
 
+/** Dequantize a neon vector holding 8 signed quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float scale = qi.scale;
+    const int offset = qi.offset;
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const float32x4_t vscale = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
 /** Dequantize a neon vector holding 16 quantized values.
  *
  * @param[in] qv Input values to be dequantized.
@@ -361,6 +402,31 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization
     return vdequantized_input;
 }
 
+/** Dequantize a neon vector holding 16 signed quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float scale = qi.scale;
+    const int offset = qi.offset;
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const float32x4_t vscale = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
 /** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values.
  *
  * @param[in] qv Input values to be dequantized.
@@ -456,6 +522,34 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf
     return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
 }
 
+/** Quantize a neon vector holding 8 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the signed quantized values
+ */
+inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float scale = qi.scale;
+    const int offset = qi.offset;
+    const float32x4_t voffset = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#else //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+}
+
 /** Quantize a neon vector holding 16 floating point values.
  *
  * @param[in] qv Input values to be quantized.
@@ -490,6 +584,42 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn
     return vcombine_u8(pa, pb);
 }
 
+/** Signed quantize a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+
+inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float scale = qi.scale;
+    const int offset = qi.offset;
+    const float32x4_t voffset = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#else //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#endif //__aarch64__
+
+        }
+    };
+    const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+    const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+    return vcombine_s8(pa, pb);
+}
+
 /** Quantize to QASYMM16 a neon vector holding 16 floating point values.
  *
  * @param[in] qv Input values to be quantized.
diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl
index a98c6aa390..71205e0403 100644
--- a/arm_compute/core/NEON/NEAsymm.inl
+++ b/arm_compute/core/NEON/NEAsymm.inl
@@ -56,4 +56,37 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v
     // convert uint16 vectors to uint8 vectors (with saturation)
     return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
 }
+inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
+{
+    // Convert int8 vectors to int16 vectors
+    const int8x8_t vd_low = vget_low_s8(vd);
+    const int8x8_t vd_high = vget_high_s8(vd);
+    int16x8_t vd_low_s16x8 = vmovl_s8(vd_low);
+    int16x8_t vd_high_s16x8 = vmovl_s8(vd_high);
+    // Convert int16 vectors to int32 vectors
+    int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8));
+    int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8));
+    int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8));
+    int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8));
+    // Convert int32 vectors to float32 vectors
+    float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4);
+    float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4);
+    float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4);
+    float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4);
+    // vd = vd*vs + vo
+    A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
+    B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
+    C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
+    D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
+    // Convert float32 vectors to int32 vectors
+    A_s32x4 = vcvtq_s32_f32(A_f32x4);
+    B_s32x4 = vcvtq_s32_f32(B_f32x4);
+    C_s32x4 = vcvtq_s32_f32(C_f32x4);
+    D_s32x4 = vcvtq_s32_f32(D_f32x4);
+    // Convert int32 vectors to int16 vectors (with saturation)
+    vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
+    vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));
+    // convert int16 vectors to int8 vectors (with saturation)
+    return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8));
+}
 } // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
index 9f2a085b3a..82103b988b 100644
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -58,7 +58,7 @@ public:
      * @note If the output tensor is a nullptr, the activation function will be performed in-place
      *
      * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                                 of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
+     *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
      * @param[out]     output          Destination tensor. Data type supported: same as @p input
      * @param[in]      activation_info Activation layer information.
      */
@@ -66,7 +66,7 @@ public:
     /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
      *
      * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
-     *                     of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
+     *                     of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
      * @param[in] output   Destination tensor info. Data type supported: same as @p input
      * @param[in] act_info Activation layer information.
      *
@@ -102,6 +102,12 @@ private:
      * @param[in] window Region on which to execute the kernel
      */
     template <ActivationLayerInfo::ActivationFunction F, typename T>
+    typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type activation(const Window &window);
+    /** Function to apply an activation function on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    template <ActivationLayerInfo::ActivationFunction F, typename T>
     typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type activation(const Window &window);
 
 private:
-- 
cgit v1.2.1
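
Note (not part of the patch): the new helpers are meant to be used together in the
dequantize -> compute-in-float -> requantize pattern that the quantized paths of
NEActivationLayerKernel follow. The sketch below is illustrative only; the function
name relu_qasymm8_signed and the qi_in/qi_out parameters are placeholders for
whatever the calling kernel provides, and it assumes an Arm Compute Library build
where the headers changed above are available.

    // Illustrative sketch: a ReLU on sixteen QASYMM8_SIGNED values using the
    // vdequantize / vquantize_signed overloads introduced above.
    #include <arm_neon.h>

    #include "arm_compute/core/NEON/NEAsymm.h"
    #include "arm_compute/core/QuantizationInfo.h"

    using namespace arm_compute;

    inline int8x16_t relu_qasymm8_signed(int8x16_t in,
                                         const UniformQuantizationInfo &qi_in,
                                         const UniformQuantizationInfo &qi_out)
    {
        // Widen the 16 signed 8-bit values into four float32x4_t lanes,
        // removing the zero point and applying the input scale.
        const float32x4x4_t vin = vdequantize(in, qi_in);

        // Apply the activation in floating point: max(x, 0) per lane.
        const float32x4_t   vzero = vdupq_n_f32(0.f);
        const float32x4x4_t vout =
        {
            {
                vmaxq_f32(vin.val[0], vzero),
                vmaxq_f32(vin.val[1], vzero),
                vmaxq_f32(vin.val[2], vzero),
                vmaxq_f32(vin.val[3], vzero),
            }
        };

        // Narrow back to 16 saturated QASYMM8_SIGNED values using the output
        // quantization parameters.
        return vquantize_signed(vout, qi_out);
    }

The ReLU-style quantized kernels typically stay in the quantized domain for the max
operation and then use vmlaq_qasymm8 (or, with this patch, vmlaq_qasymm8_signed) to
fold the input-to-output requantization into a single multiply-accumulate.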