From 8d4d1b85bc57d5f76f3939bb422e44df68dc2342 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 28 Nov 2019 11:31:23 +0000 Subject: COMPMID-2796: Add support for QASYMM8_SIGNED in NEActivationLayer and NEPReluLayer Change-Id: I089fd19a6beab7779d690bc9ace327f661c2753d Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/2407 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio --- arm_compute/core/NEON/NEAsymm.h | 130 ++++++++++++++ arm_compute/core/NEON/NEAsymm.inl | 33 ++++ .../core/NEON/kernels/NEActivationLayerKernel.h | 10 +- arm_compute/core/QuantizationInfo.h | 7 +- .../runtime/NEON/functions/NEActivationLayer.h | 4 +- arm_compute/runtime/NEON/functions/NEPReluLayer.h | 4 +- src/core/NEON/kernels/NEActivationLayerKernel.cpp | 175 +++++++++++++++++- .../NEON/kernels/NEElementwiseOperationKernel.cpp | 197 ++++++++++++++++++++- tests/validation/NEON/ActivationLayer.cpp | 19 ++ tests/validation/NEON/PReluLayer.cpp | 69 ++++---- tests/validation/fixtures/ActivationLayerFixture.h | 13 +- .../validation/reference/ElementwiseOperations.cpp | 30 ++++ 12 files changed, 637 insertions(+), 54 deletions(-) diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h index 53a3ea773f..234d48882c 100644 --- a/arm_compute/core/NEON/NEAsymm.h +++ b/arm_compute/core/NEON/NEAsymm.h @@ -35,6 +35,12 @@ using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 2 using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */ using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */ +using qasymm8x8_signed_t = int8x8_t; /**< 8 bit quantized signed asymmetric vector with 8 elements */ +using qasymm8x8x2_signed_t = int8x8x2_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */ +using qasymm8x8x3_signed_t = int8x8x3_t; /**< 8 bit quantized signed asymmetric vector with 24 elements */ +using qasymm8x8x4_signed_t = int8x8x4_t; /**< 8 bit quantized signed asymmetric vector with 32 elements */ +using qasymm8x16_signed_t = int8x16_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */ + /** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector * * vd*vs + vo @@ -47,6 +53,18 @@ using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 1 */ uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo); +/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector + * + * vd*vs + vo + * + * @param[in] vd Input vector value in QASYMM8_SIGNED format + * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. + * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. + * + * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit + */ +int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo); + /** Performs final quantization step on 16 elements * * @tparam is_bounded_relu Specified if a fused bounded relu should be applied @@ -336,6 +354,29 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI return vdequantized_input; } +/** Dequantize a neon vector holding 8 singed quantized values. + * + * @param[in] qv Input values to be dequantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return Dequantized values in a neon vector + */ +inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale), + } + }; + return vdequantized_input; +} + /** Dequantize a neon vector holding 16 quantized values. * * @param[in] qv Input values to be dequantized. @@ -361,6 +402,31 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization return vdequantized_input; } +/** Dequantize a neon vector holding 16 signed quantized values. + * + * @param[in] qv Input values to be dequantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + } + }; + return vdequantized_input; +} + /** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values. * * @param[in] qv Input values to be dequantized. @@ -456,6 +522,34 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); } +/** Quantize a neon vector holding 8 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return A neon vector holding the singed quantized values + */ +inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const float32x4_t voffset = vdupq_n_f32(offset); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), +#endif //__aarch64__ + } + }; + return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); +} + /** Quantize a neon vector holding 16 floating point values. * * @param[in] qv Input values to be quantized. @@ -490,6 +584,42 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn return vcombine_u8(pa, pb); } +/** Signed quantize a neon vector holding 16 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return A neon vector holding the quantized values + */ + +inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const float32x4_t voffset = vdupq_n_f32(offset); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#endif //__aarch64__ + + } + }; + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_s8(pa, pb); +} + /** Quantize to QASYMM16 a neon vector holding 16 floating point values. * * @param[in] qv Input values to be quantized. diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl index a98c6aa390..71205e0403 100644 --- a/arm_compute/core/NEON/NEAsymm.inl +++ b/arm_compute/core/NEON/NEAsymm.inl @@ -56,4 +56,37 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v // convert uint16 vectors to uint8 vectors (with saturation) return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8)); } +inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo) +{ + // Convert uint8 vectors to int16 vectors + const int8x8_t vd_low = vget_low_s8(vd); + const int8x8_t vd_high = vget_high_s8(vd); + int16x8_t vd_low_s16x8 = vmovl_s8(vd_low); + int16x8_t vd_high_s16x8 = vmovl_s8(vd_high); + // Convert int16 vectors to int32 vectors + int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8)); + int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8)); + int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8)); + int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8)); + // Convert int32 vectors to float32 vectors + float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4); + float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4); + float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4); + float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4); + // vd = vd*vs + vo + A_f32x4 = vmlaq_f32(vo, A_f32x4, vs); + B_f32x4 = vmlaq_f32(vo, B_f32x4, vs); + C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); + D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); + // Convert float32 vectors to int32 vectors + A_s32x4 = vcvtq_s32_f32(A_f32x4); + B_s32x4 = vcvtq_s32_f32(B_f32x4); + C_s32x4 = vcvtq_s32_f32(C_f32x4); + D_s32x4 = vcvtq_s32_f32(D_f32x4); + // Convert int32 vectors to int16 vectors (with saturation) + vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4)); + vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4)); + // convert int16 vectors to int8 vectors (with saturation) + return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8)); +} } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h index 9f2a085b3a..82103b988b 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -58,7 +58,7 @@ public: * @note If the output tensor is a nullptr, the activation function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] activation_info Activation layer information. */ @@ -66,7 +66,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. * @param[in] output Destination tensor info. Data type supported: same as @p input * @param[in] act_info Activation layer information. * @@ -102,6 +102,12 @@ private: * @param[in] window Region on which to execute the kernel */ template + typename std::enable_if::value, void>::type activation(const Window &window); + /** Function to apply an activation function on a tensor. + * + * @param[in] window Region on which to execute the kernel + */ + template typename std::enable_if::value, void>::type activation(const Window &window); private: diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h index 7a6fe42098..06ba665c6b 100644 --- a/arm_compute/core/QuantizationInfo.h +++ b/arm_compute/core/QuantizationInfo.h @@ -33,9 +33,10 @@ namespace arm_compute { -using qasymm8_t = uint8_t; /**< 8 bit quantized asymmetric scalar value */ -using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */ -using qasymm16_t = uint16_t; /**< 16 bit quantized asymmetric scalar value */ +using qasymm8_signed_t = int8_t; /**< 8 bit signed quantized asymmetric scalar value */ +using qasymm8_t = uint8_t; /**< 8 bit quantized asymmetric scalar value */ +using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */ +using qasymm16_t = uint16_t; /**< 16 bit quantized asymmetric scalar value */ /** Quantization info when assuming per layer quantization */ struct UniformQuantizationInfo diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h index cd9b22d397..95901dc2d8 100644 --- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h @@ -59,7 +59,7 @@ public: * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] activation_info Activation layer parameters. */ @@ -68,7 +68,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayer * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. * @param[in] output Destination tensor info. Data type supported: same as @p input * @param[in] act_info Activation layer information. * diff --git a/arm_compute/runtime/NEON/functions/NEPReluLayer.h b/arm_compute/runtime/NEON/functions/NEPReluLayer.h index c0a1df472f..102a165383 100644 --- a/arm_compute/runtime/NEON/functions/NEPReluLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPReluLayer.h @@ -40,14 +40,14 @@ class NEPReluLayer : public INESimpleFunction public: /** Set the input and output tensor. * - * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] alpha Source alpha tensor. Data types supported: same of @p input. * @param[out] output Destination tensor. Data type supported: same as @p input */ void configure(const ITensor *input, const ITensor *alpha, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEPReluLayer * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] alpha Source alpha tensor info. Data types supported: same of @p input. * @param[in] output Destination tensor info. Data type supported: same as @p input * diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index c338ef09c7..44f76f6e22 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -48,7 +48,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); static std::set qasymm8_supported_activations = { @@ -72,8 +72,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (qsymm16_supported_activations.count(f_act) == 0), "For QSYMM16 only tanh and logistic are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) + && (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + && (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); @@ -173,6 +178,17 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat }; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ + // Activation functions : QASYMM8_SIGNED + static std::map act_map_qasymm8_signed = + { + { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, + { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, + { ActivationFunction::IDENTITY, &NEActivationLayerKernel::activation }, + }; + // Activation functions : QASYMM8 static std::map act_map_qasymm8 = { @@ -193,6 +209,9 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat switch(input->info()->data_type()) { + case DataType::QASYMM8_SIGNED: + _func = act_map_qasymm8_signed[activation_info.activation()]; + break; case DataType::QASYMM8: _func = act_map_qasymm8[activation_info.activation()]; break; @@ -507,6 +526,156 @@ typename std::enable_if::value, void>::type NEActivat input, output); } +template +typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) +{ + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationFunction act = F; + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + + const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = _output->info()->quantization_info().uniform(); + const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(_act_info.a(), qi_in)); + const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(_act_info.b(), qi_in)); + const qasymm8_signed_t a = quantize_qasymm8_signed(_act_info.a(), qi_in); + const qasymm8_signed_t b = quantize_qasymm8_signed(_act_info.b(), qi_in); + const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); + const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); + const auto vconst_1 = vdupq_n_f32(1.f); + const float32x4_t va_f32 = vdupq_n_f32(_act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b()); + const float a_f32 = _act_info.a(); + const float b_f32 = _act_info.b(); + + // Initialise scale/offset for re-quantization + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + float32x4_t vs = vdupq_n_f32(s); + float32x4_t vo = vdupq_n_f32(o); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + wrapper::traits::neon_bitvector_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_s8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), + } + }; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if(act == ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), + } + }; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + T in = *(reinterpret_cast(input_ptr + x)); + T tmp; + if(act == ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = std::max(0, std::min(tmp * s + o, 255)); + } + else if(act == ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = std::max(0, std::min(tmp * s + o, 255)); + } + else if(act == ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = std::max(0, std::min(tmp * s + o, 255)); + } + else if(act == ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if(act == ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} + template typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) { diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp index 9bd080983c..4928ae9bdd 100644 --- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp +++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp @@ -61,6 +61,21 @@ float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, return out; } +float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +{ + qasymm8x16_signed_t x = vld1q_s8(input1_ptr); + const float32x4x4_t out = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + } + }; + return out; +} + void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) { const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1]))); @@ -89,6 +104,27 @@ void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32 store_quantized(output_ptr, out); } +void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) +{ + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_s8(output_ptr, vcombine_s8(pa, pb)); +} + +void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + int32x4x4_t out = + { + { + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + } + }; + store_quantized_signed(output_ptr, out); +} + float32x4x4_t dup_quantized(qasymm8_t broadcast_value, int offset, float scale) { const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); @@ -152,6 +188,12 @@ inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const floa return quantize_qasymm8(elementwise_arithm_op_scalar(a, b), qinfo); } +template +inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + return quantize_qasymm8_signed(elementwise_arithm_op_scalar(a, b), qinfo); +} + template inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) { @@ -368,6 +410,24 @@ inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_e return x; } +template +inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_arithm_op(af, bf); + store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + template inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) @@ -396,6 +456,21 @@ inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, in } return x; } +template +inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} template inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, @@ -697,6 +772,114 @@ void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *o } } +void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero) + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if(is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, + vscale1, vscale2, voffseto, invvscaleo); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + template void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { @@ -733,6 +916,13 @@ void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITe &elementwise_arithm_op_quantized_broadcast_loop, &elementwise_arithm_op_quantized_loop); } +template +void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar, + &elementwise_arithm_op_quantized_signed_broadcast_loop, + &elementwise_arithm_op_quantized_singed_loop); +} template void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -773,7 +963,8 @@ configure_arithm_func(const ITensor *input1, const ITensor *input2, ITensor *out { "op_F32_F32_F32", &elementwise_arithm_op> }, { "op_S16_S16_S16", &elementwise_arithm_op> }, { "op_S32_S32_S32", &elementwise_arithm_op> }, - { "op_QASYMM8_QASYMM8_QASYMM8", &elementwise_arithm_op_quantized } + { "op_QASYMM8_QASYMM8_QASYMM8", &elementwise_arithm_op_quantized }, + { "op_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &elementwise_arithm_op_quantized_signed } }; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC map_function["op_F16_F16_F16"] = &elementwise_arithm_op>; @@ -808,8 +999,8 @@ NEElementwiseOperationKernel::NEElementwiseOperationKernel() Status NEElementwiseOperationKernel::validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp index 8c18d47da9..1b9278988a 100644 --- a/tests/validation/NEON/ActivationLayer.cpp +++ b/tests/validation/NEON/ActivationLayer.cpp @@ -263,6 +263,25 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEActivationLayerQuantizedFixture, fra } TEST_SUITE_END() // QASYMM8 +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), QuantizedActivationDataset), + framework::dataset::make("DataType", + DataType::QASYMM8_SIGNED)), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10.0f) }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEActivationLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), QuantizedActivationDataset), + framework::dataset::make("DataType", + DataType::QASYMM8_SIGNED)), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10.0f) }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8_SIGNED + /** Input data sets. */ const auto Int16QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH diff --git a/tests/validation/NEON/PReluLayer.cpp b/tests/validation/NEON/PReluLayer.cpp index d9604f94aa..0630a057d6 100644 --- a/tests/validation/NEON/PReluLayer.cpp +++ b/tests/validation/NEON/PReluLayer.cpp @@ -42,12 +42,15 @@ namespace validation { namespace { -RelativeTolerance tolerance_fp32(0.000001f); +RelativeTolerance tolerance_fp32(0.000001f); +AbsoluteTolerance tolerance_s8(1); /** Input data sets **/ const auto PReluLayerQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("DataType", DataType::QASYMM8)); +const auto PReluLayerQASYMM8SignedDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)); const auto PReluLayerFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataType", DataType::F32)); @@ -101,23 +104,6 @@ using NEPReluLayerQuantizedFixture = PReluLayerValidationQuantizedFixture(shape, DataType::QASYMM8); - Tensor ref_src2 = create_tensor(shape, DataType::QASYMM8); - Tensor dst = create_tensor(shape, DataType::QASYMM8); - - // Create and Configure function - NEPReluLayer prelu; - prelu.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEPReluLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), PReluLayerQASYMM8Dataset), framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })), @@ -141,8 +127,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPReluLayerQuantizedFixture, framewor // Validate output validate(Accessor(_target), _reference, tolerance_fp32, 0.01); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPReluLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), + PReluLayerQASYMM8SignedDataset), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 20) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 5) })) + + ) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_s8, 0.01); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, NEPReluLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), + PReluLayerQASYMM8SignedDataset), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 20) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 5) })) + + ) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_s8, 0.01); +} +TEST_SUITE_END() // QASYMM8_SIGNED +TEST_SUITE_END() // Quantized TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -162,23 +174,6 @@ TEST_SUITE_END() // FP16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, DataType::F32); - Tensor ref_src2 = create_tensor(shape, DataType::F32); - Tensor dst = create_tensor(shape, DataType::F32); - - // Create and Configure function - NEPReluLayer prelu; - prelu.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEPReluLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), PReluLayerFP32Dataset)) { // Validate output diff --git a/tests/validation/fixtures/ActivationLayerFixture.h b/tests/validation/fixtures/ActivationLayerFixture.h index f6d43ddd89..3294986519 100644 --- a/tests/validation/fixtures/ActivationLayerFixture.h +++ b/tests/validation/fixtures/ActivationLayerFixture.h @@ -150,8 +150,9 @@ protected: private: QuantizationInfo calculate_output_quantization_info(DataType dt, const ActivationLayerInfo &act_info, const QuantizationInfo &default_qinfo) { - auto qasymm8_max = float(std::numeric_limits::max()) + 1.f; - auto qsymm16_max = float(std::numeric_limits::max()) + 1.f; + auto qasymm8_max = float(std::numeric_limits::max()) + 1.f; + auto qasymm8_signed_max = float(std::numeric_limits::max()) + 1.f; + auto qsymm16_max = float(std::numeric_limits::max()) + 1.f; switch(act_info.activation()) { @@ -164,6 +165,10 @@ private: { return QuantizationInfo(1.f / (0.5 * qasymm8_max), int(0.5 * qasymm8_max)); } + else if(dt == DataType::QASYMM8_SIGNED) + { + return QuantizationInfo(1.f / qasymm8_signed_max, 0); + } else { return default_qinfo; @@ -177,6 +182,10 @@ private: { return QuantizationInfo(1.f / qasymm8_max, 0); } + else if(dt == DataType::QASYMM8_SIGNED) + { + return QuantizationInfo(1.f / (2.f * qasymm8_signed_max), -int(qasymm8_signed_max)); + } else { return default_qinfo; diff --git a/tests/validation/reference/ElementwiseOperations.cpp b/tests/validation/reference/ElementwiseOperations.cpp index 7b39e18bd9..bd6eec3688 100644 --- a/tests/validation/reference/ElementwiseOperations.cpp +++ b/tests/validation/reference/ElementwiseOperations.cpp @@ -183,6 +183,36 @@ SimpleTensor arithmetic_operation(ArithmeticOperation op, const SimpleT return dst; } } +template <> +SimpleTensor arithmetic_operation(ArithmeticOperation op, const SimpleTensor &src1, const SimpleTensor &src2, SimpleTensor &dst, ConvertPolicy convert_policy) +{ + if(dst.data_type() == DataType::QASYMM8_SIGNED) + { + SimpleTensor src1_tmp = convert_from_asymmetric(src1); + SimpleTensor src2_tmp = convert_from_asymmetric(src2); + SimpleTensor dst_tmp(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dst.data_type()); + + Coordinates id_src1{}; + Coordinates id_src2{}; + Coordinates id_dst{}; + + BroadcastUnroll::unroll(op, src1_tmp, src2_tmp, dst_tmp, convert_policy, id_src1, id_src2, id_dst); + + dst = convert_to_asymmetric(dst_tmp, dst.quantization_info()); + return dst; + } + else + { + // DataType::S8 + Coordinates id_src1{}; + Coordinates id_src2{}; + Coordinates id_dst{}; + + BroadcastUnroll::unroll(op, src1, src2, dst, convert_policy, id_src1, id_src2, id_dst); + + return dst; + } +} template <> SimpleTensor arithmetic_operation(ArithmeticOperation op, const SimpleTensor &src1, const SimpleTensor &src2, SimpleTensor &dst, ConvertPolicy convert_policy) -- cgit v1.2.1