From c3a74200a66ea5fb718b8406bed2043bc097930e Mon Sep 17 00:00:00 2001 From: Sang-Hoon Park Date: Fri, 22 Nov 2019 16:05:46 +0000 Subject: COMPMID-2775 [NE] add support for QASYMM8_SIGNED to SoftmaxLayer Change-Id: Ic46d4143929c8c9b548355d85c78542faf25d612 Signed-off-by: Sang-Hoon Park Reviewed-on: https://review.mlplatform.org/c/2376 Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Georgios Pinitas --- arm_compute/core/NEON/NEColorConvertHelper.inl | 20 ++++++++++---------- arm_compute/core/NEON/NEMath.h | 17 ++++++++++++++++- arm_compute/core/NEON/NEMath.inl | 25 ++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 12 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl index 62c6eb5aea..7145d6f206 100644 --- a/arm_compute/core/NEON/NEColorConvertHelper.inl +++ b/arm_compute/core/NEON/NEColorConvertHelper.inl @@ -83,7 +83,7 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); //Conversion from 1(Greyscale) 4 floats to 1(Greyscale) 4 uint8s - arm_compute::convert_float32x4x4_to_unit8x16(out_float32, out); + arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out); } inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec, @@ -214,12 +214,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]); } - arm_compute::convert_float32x4x4_to_unit8x16(fyvec_top, vec_top.val[0]); - arm_compute::convert_float32x4x4_to_unit8x16(fuvec_top, vec_top.val[1]); - arm_compute::convert_float32x4x4_to_unit8x16(fvvec_top, vec_top.val[2]); - arm_compute::convert_float32x4x4_to_unit8x16(fyvec_bottom, vec_bottom.val[0]); - arm_compute::convert_float32x4x4_to_unit8x16(fuvec_bottom, vec_bottom.val[1]); - 
arm_compute::convert_float32x4x4_to_unit8x16(fvvec_bottom, vec_bottom.val[2]); + arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]); + arm_compute::convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]); + arm_compute::convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]); + arm_compute::convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]); + arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]); + arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); } inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, @@ -298,9 +298,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co } uint8x16_t yvec, uvec, vvec; - arm_compute::convert_float32x4x4_to_unit8x16(fyvec, yvec); - arm_compute::convert_float32x4x4_to_unit8x16(fuvec, uvec); - arm_compute::convert_float32x4x4_to_unit8x16(fvvec, vvec); + arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec); + arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec); + arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec); vst1q_u8(out_y, yvec); vst1q_u8(out_u, uvec); diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h index aa3054306c..54f8252250 100644 --- a/arm_compute/core/NEON/NEMath.h +++ b/arm_compute/core/NEON/NEMath.h @@ -165,6 +165,14 @@ int32_t rounding_divide_by_pow2(int32_t x, int exponent); */ float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in); +/** Converts from int8x16 to float32x4x4_t + * + * @param[in] in Vector of int8 to be converted + * + * @return Converted vector of float + */ +float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in); + /** Converts from two float32x4x3_t to just one uint8x8x3_t * * @param[in] in1 First input vector of float to be converted @@ -178,7 +186,14 @@ void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x * @param[in] in 
Vector of float to be converted * @param[out] out Converted vector of uint8 to store the result */ -void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out); +void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out); + +/** Converts from float32x4x4_t to just one int8x16_t + * + * @param[in] in Vector of float to be converted + * @param[out] out Converted vector of int8 to store the result + */ +void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out); /** Calculate sine. * diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index 179f1b6299..5d8b82c281 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -331,6 +331,20 @@ inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in) return out; } +inline float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in) +{ + float32x4x4_t out; + + const auto tmp1 = vmovl_s8(vget_low_s8(in)); + out.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp1))); + out.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp1))); + + const auto tmp2 = vmovl_s8(vget_high_s8(in)); + out.val[2] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp2))); + out.val[3] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp2))); + return out; +} + inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out) { out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), @@ -341,7 +355,7 @@ inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const flo vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); } -inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out) +inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out) { const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1]))); @@ -350,6 +364,15 @@ inline void 
convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); } +inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) +{ + const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), + vqmovn_s32(vcvtq_s32_f32(in.val[1]))); + const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), + vqmovn_s32(vcvtq_s32_f32(in.val[3]))); + out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Exponent polynomial coefficients */ /** Logarithm polynomial coefficients */ -- cgit v1.2.1