From c3a74200a66ea5fb718b8406bed2043bc097930e Mon Sep 17 00:00:00 2001 From: Sang-Hoon Park Date: Fri, 22 Nov 2019 16:05:46 +0000 Subject: COMPMID-2775 [NE] add support for QASYMM8_SIGNED to SoftmaxLayer Change-Id: Ic46d4143929c8c9b548355d85c78542faf25d612 Signed-off-by: Sang-Hoon Park Reviewed-on: https://review.mlplatform.org/c/2376 Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Georgios Pinitas --- arm_compute/core/NEON/NEMath.inl | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'arm_compute/core/NEON/NEMath.inl') diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index 179f1b6299..5d8b82c281 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -331,6 +331,20 @@ inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in) return out; } +inline float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in) +{ + float32x4x4_t out; + + const auto tmp1 = vmovl_s8(vget_low_s8(in)); + out.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp1))); + out.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp1))); + + const auto tmp2 = vmovl_s8(vget_high_s8(in)); + out.val[2] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp2))); + out.val[3] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp2))); + return out; +} + inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out) { out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), @@ -341,7 +355,7 @@ inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const flo vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); } -inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out) +inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out) { const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1]))); @@ -350,6 +364,15 @@ inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); } +inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) +{ + const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), + vqmovn_s32(vcvtq_s32_f32(in.val[1]))); + const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), + vqmovn_s32(vcvtq_s32_f32(in.val[3]))); + out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Exponent polynomial coefficients */ /** Logarithm polynomial coefficients */ -- cgit v1.2.1