diff options
Diffstat (limited to 'src/core/NEON/SVEMath.inl')
-rw-r--r-- | src/core/NEON/SVEMath.inl | 70 |
1 files changed, 48 insertions, 22 deletions
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl index 8973d0b273..b30125dcb7 100644 --- a/src/core/NEON/SVEMath.inl +++ b/src/core/NEON/SVEMath.inl @@ -32,8 +32,16 @@ namespace arm_compute { -inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t coeff_1, svfloat32_t coeff_2, svfloat32_t coeff_3, - svfloat32_t coeff_4, svfloat32_t coeff_5, svfloat32_t coeff_6, svfloat32_t coeff_7, svfloat32_t coeff_8) +inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, + svfloat32_t x, + svfloat32_t coeff_1, + svfloat32_t coeff_2, + svfloat32_t coeff_3, + svfloat32_t coeff_4, + svfloat32_t coeff_5, + svfloat32_t coeff_6, + svfloat32_t coeff_7, + svfloat32_t coeff_8) { const auto A = svmla_f32_z(pg, coeff_1, coeff_5, x); const auto B = svmla_f32_z(pg, coeff_3, coeff_7, x); @@ -45,8 +53,16 @@ inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t c return res; } -inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, svfloat16_t coeff_1, svfloat16_t coeff_2, svfloat16_t coeff_3, - svfloat16_t coeff_4, svfloat16_t coeff_5, svfloat16_t coeff_6, svfloat16_t coeff_7, svfloat16_t coeff_8) +inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, + svfloat16_t x, + svfloat16_t coeff_1, + svfloat16_t coeff_2, + svfloat16_t coeff_3, + svfloat16_t coeff_4, + svfloat16_t coeff_5, + svfloat16_t coeff_6, + svfloat16_t coeff_7, + svfloat16_t coeff_8) { const auto A = svmla_f16_z(pg, coeff_1, coeff_5, x); const auto B = svmla_f16_z(pg, coeff_3, coeff_7, x); @@ -90,15 +106,17 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3])); const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4])); - const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f - const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f - const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f - const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto neg_ln2_hi = + svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = + svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f const auto inf = svdup_n_f32(std::numeric_limits<float>::infinity()); - const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) const auto zero = svdup_n_f32(0.f); - const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) // Range reduction: // e^x = 2^n * e^r @@ -114,23 +132,23 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) // (i.e. n) because the decimal part has been pushed out and lost. // * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent // in FP32 format. Left shifting z by 23 bits will result in 2^n. - const auto z = svmla_f32_z(pg, shift, x, inv_ln2); - const auto n = svsub_f32_z(pg, z, shift); - const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + const auto z = svmla_f32_z(pg, shift, x, inv_ln2); + const auto n = svsub_f32_z(pg, z, shift); + const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32. // This outperforms longer Taylor series (3-4 tabs) both in term of accuracy and performance. const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi); - const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); // Compute the truncated Taylor series of e^r. // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) const auto r2 = svmul_f32_z(pg, r, r); - const auto p1 = svmul_f32_z(pg, c1, r); - const auto p23 = svmla_f32_z(pg, c2, c3, r); - const auto p45 = svmla_f32_z(pg, c4, c5, r); - const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); auto poly = svmla_f32_z(pg, scale, p12345, scale); @@ -213,7 +231,8 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x) auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23))); // Polynomial Approximation - auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, log_tab_7, log_tab_8); + auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, + log_tab_7, log_tab_8); // Reconstruct poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2); @@ -259,7 +278,8 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val) //Find positive or negative const auto c_v = svabs_z(pg, wrapper::svcvt_z<int32_t>(pg, svmul_z(pg, val, ipi_v))); const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0))); - const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))), wrapper::svdup_n(IntType(0))); + const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))), + wrapper::svdup_n(IntType(0))); auto neg_v = sveor_z(pg, odd_v, sign_v); @@ -347,7 +367,10 @@ inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) #if defined(ARM_COMPUTE_ENABLE_SVE2) template <> -inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3) { svuint8_t out; const auto all_true_pg = svptrue_b32(); @@ -381,7 +404,10 @@ inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const } template <> -inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3) { svint8_t out; const auto all_true_pg = svptrue_b32(); |