diff options
Diffstat (limited to 'src/core/NEON/SVEMath.inl')
-rw-r--r-- | src/core/NEON/SVEMath.inl | 68 |
1 files changed, 68 insertions, 0 deletions
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl index cf7f9f5a95..a851b8a07b 100644 --- a/src/core/NEON/SVEMath.inl +++ b/src/core/NEON/SVEMath.inl @@ -325,5 +325,73 @@ inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) #endif /* defined(__ARM_FEATURE_SVE2) */ } +template <> +inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +{ + svuint8_t out; + const auto all_true_pg = svptrue_b32(); + auto tmp_0 = svcvt_u32_f32_z(all_true_pg, in_0); + auto tmp_1 = svcvt_u32_f32_z(all_true_pg, in_1); + auto tmp_2 = svcvt_u32_f32_z(all_true_pg, in_2); + auto tmp_3 = svcvt_u32_f32_z(all_true_pg, in_3); + + auto tmp_16_0 = svqxtnt_u32(svqxtnb_u32(tmp_0), tmp_1); + auto tmp_16_1 = svqxtnt_u32(svqxtnb_u32(tmp_2), tmp_3); + + auto tmp_16_uzp_0 = svuzp1(tmp_16_0, tmp_16_0); + auto tmp_16_uzp_1 = svuzp2(tmp_16_0, tmp_16_0); + auto tmp_16_uzp_2 = svuzp1(tmp_16_1, tmp_16_1); + auto tmp_16_uzp_3 = svuzp2(tmp_16_1, tmp_16_1); + + auto pg = svwhilelt_b16_s32(0, svcnth() / 2); + + tmp_16_0 = svsplice(pg, tmp_16_uzp_0, tmp_16_uzp_1); + tmp_16_1 = svsplice(pg, tmp_16_uzp_2, tmp_16_uzp_3); + + out = svqxtnt_u16(svqxtnb_u16(tmp_16_0), tmp_16_1); + + auto out_uzp_0 = svuzp1(out, out); + auto out_uzp_1 = svuzp2(out, out); + + pg = svwhilelt_b8_s32(0, svcntb() / 2); + out = svsplice(pg, out_uzp_0, out_uzp_1); + + return out; +} + +template <> +inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +{ + svint8_t out; + const auto all_true_pg = svptrue_b32(); + auto tmp_0 = svcvt_s32_f32_z(all_true_pg, in_0); + auto tmp_1 = svcvt_s32_f32_z(all_true_pg, in_1); + auto tmp_2 = svcvt_s32_f32_z(all_true_pg, in_2); + auto tmp_3 = svcvt_s32_f32_z(all_true_pg, in_3); + + auto tmp_16_0 = svqxtnt_s32(svqxtnb_s32(tmp_0), tmp_1); + auto tmp_16_1 = svqxtnt_s32(svqxtnb_s32(tmp_2), tmp_3); + + auto tmp_16_uzp_0 = svuzp1(tmp_16_0, tmp_16_0); + auto tmp_16_uzp_1 = svuzp2(tmp_16_0, tmp_16_0); + auto tmp_16_uzp_2 = svuzp1(tmp_16_1, tmp_16_1); + auto tmp_16_uzp_3 = svuzp2(tmp_16_1, tmp_16_1); + + auto pg = svwhilelt_b16_s32(0, svcnth() / 2); + + tmp_16_0 = svsplice(pg, tmp_16_uzp_0, tmp_16_uzp_1); + tmp_16_1 = svsplice(pg, tmp_16_uzp_2, tmp_16_uzp_3); + + out = svqxtnt_s16(svqxtnb_s16(tmp_16_0), tmp_16_1); + + auto out_uzp_0 = svuzp1(out, out); + auto out_uzp_1 = svuzp2(out, out); + + pg = svwhilelt_b8_s32(0, svcntb() / 2); + out = svsplice(pg, out_uzp_0, out_uzp_1); + + return out; +} + } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ |