diff options
-rw-r--r-- | arm_compute/core/NEON/NEMath.inl | 35 | ||||
-rw-r--r-- | src/core/NEON/kernels/NEActivationLayerKernel.cpp | 19 | ||||
-rw-r--r-- | src/core/NEON/kernels/NEReductionOperationKernel.cpp | 5 | ||||
-rw-r--r-- | tests/validation/NEON/YOLOLayer.cpp | 2 |
4 files changed, 13 insertions, 48 deletions
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index 2bc1ab7964..4de80509f0 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -245,37 +245,12 @@ inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t inline float16x8_t vexpq_f16(float16x8_t x) { - static const std::array<float16x8_t, 8> exp_tab_f16 = - { - { - vdupq_n_f16(1.f), - vdupq_n_f16(0.0416598916054f), - vdupq_n_f16(0.500000596046f), - vdupq_n_f16(0.0014122662833f), - vdupq_n_f16(1.00000011921f), - vdupq_n_f16(0.00833693705499f), - vdupq_n_f16(0.166665703058f), - vdupq_n_f16(0.000195780929062f), - } - }; - - static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); // ln(2) - static const float16x8_t CONST_INV_LN2 = vdupq_n_f16(1.4426950408f); // 1/ln(2) - static const float16x8_t CONST_0 = vdupq_n_f16(0.f); - static const int16x8_t CONST_NEGATIVE_126 = vdupq_n_s16(-126); - - // Perform range reduction [-log(2),log(2)] - const int16x8_t m = vcvtq_s16_f16(vmulq_f16(x, CONST_INV_LN2)); - const float16x8_t val = vsubq_f16(x, vmulq_f16(vcvtq_f16_s16(m), CONST_LN2)); + // TODO (COMPMID-1535) : Revisit FP16 approximations + const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x)); + const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); - // Polynomial Approximation - float16x8_t poly = vtaylor_polyq_f16(val, exp_tab_f16); - - // Reconstruct - poly = vreinterpretq_f16_s16(vqaddq_s16(vreinterpretq_s16_f16(poly), vqshlq_n_s16(m, 9))); - poly = vbslq_f16(vcltq_s16(m, CONST_NEGATIVE_126), CONST_0, poly); - - return poly; + const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vexpq_f32(x_low)), vexpq_f32(x_high)); + return res; } inline float16x8_t vlogq_f16(float16x8_t x) diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index 2163f7bb63..5ce79f1007 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -184,7 +184,7 @@ typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivat Iterator output(_output, window); static const float16x8_t CONST_0 = vdupq_n_f16(0.f); - static const float16x4_t CONST_1_H = vdup_n_f16(1.f); + static const float16x8_t CONST_1_H = vdupq_n_f16(1.f); static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f); @@ -240,24 +240,11 @@ typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivat break; case ActivationFunction::LOGISTIC: { - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float16x4x2_t in0 = - { - vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_low_f16(in.val[0]))))))), - vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_high_f16(in.val[0]))))))), - }; - - const float16x4x2_t in1 = - { - vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_low_f16(in.val[1]))))))), - vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_high_f16(in.val[1]))))))), - }; - tmp = { { - vcombine_f16(in0.val[0], in0.val[1]), - vcombine_f16(in1.val[0], in1.val[1]), + vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))), + vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1])))) } }; } diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index b77219cd79..182e93d177 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -154,7 +154,10 @@ struct RedOpX input); auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value)); - carry_addition = wrapper::vpadd(carry_addition, carry_addition); + for(int i = 0; i < S / 4; ++i) + { + carry_addition = wrapper::vpadd(carry_addition, carry_addition); + } auto res = wrapper::vgetlane(carry_addition, 0); if(op == ReductionOperation::MEAN_SUM) diff --git a/tests/validation/NEON/YOLOLayer.cpp b/tests/validation/NEON/YOLOLayer.cpp index 926a2dad86..6225dc170e 100644 --- a/tests/validation/NEON/YOLOLayer.cpp +++ b/tests/validation/NEON/YOLOLayer.cpp @@ -46,7 +46,7 @@ namespace /** Tolerance */ constexpr AbsoluteTolerance<float> tolerance_f32(1e-6f); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -constexpr RelativeTolerance<float> tolerance_f16(0.001f); +constexpr RelativeTolerance<float> tolerance_f16(0.01f); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ /** Floating point data sets. */ |