author     Michele Di Giorgio <michele.digiorgio@arm.com>    2018-11-20 16:03:01 +0000
committer  Michalis Spyrou <michalis.spyrou@arm.com>         2018-11-21 14:08:19 +0000
commit     1c948d47f55ff8a39aa527f63ea7df93a13dd38e (patch)
tree       4579798cb95ecaf62b5d7fe61b5d3753301e2804
parent     8b2814ab7b9dc00278132d74d2f738b843b6c0c7 (diff)
COMPMID-1800: (Nightly) Mismatches on SC9863 board for NEON FP16
Fixes for:
- ReduceMean: reduction on the X axis for FP16 with 8 elements was performed only up to a certain point. The fix now takes into account the number of elements of the vector and does as many reductions as necessary.
- YOLOLayer: activation for FP16 has to be performed on 32 bits until the FP16 approximation is fixed.

Change-Id: I75373f4edd37de476e6fe1a56de3ef386b65c619
-rw-r--r--  arm_compute/core/NEON/NEMath.inl                       35
-rw-r--r--  src/core/NEON/kernels/NEActivationLayerKernel.cpp      19
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.cpp    5
-rw-r--r--  tests/validation/NEON/YOLOLayer.cpp                     2
4 files changed, 13 insertions, 48 deletions
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 2bc1ab7964..4de80509f0 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -245,37 +245,12 @@ inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t
inline float16x8_t vexpq_f16(float16x8_t x)
{
- static const std::array<float16x8_t, 8> exp_tab_f16 =
- {
- {
- vdupq_n_f16(1.f),
- vdupq_n_f16(0.0416598916054f),
- vdupq_n_f16(0.500000596046f),
- vdupq_n_f16(0.0014122662833f),
- vdupq_n_f16(1.00000011921f),
- vdupq_n_f16(0.00833693705499f),
- vdupq_n_f16(0.166665703058f),
- vdupq_n_f16(0.000195780929062f),
- }
- };
-
- static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); // ln(2)
- static const float16x8_t CONST_INV_LN2 = vdupq_n_f16(1.4426950408f); // 1/ln(2)
- static const float16x8_t CONST_0 = vdupq_n_f16(0.f);
- static const int16x8_t CONST_NEGATIVE_126 = vdupq_n_s16(-126);
-
- // Perform range reduction [-log(2),log(2)]
- const int16x8_t m = vcvtq_s16_f16(vmulq_f16(x, CONST_INV_LN2));
- const float16x8_t val = vsubq_f16(x, vmulq_f16(vcvtq_f16_s16(m), CONST_LN2));
+ // TODO (COMPMID-1535) : Revisit FP16 approximations
+ const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
+ const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x));
- // Polynomial Approximation
- float16x8_t poly = vtaylor_polyq_f16(val, exp_tab_f16);
-
- // Reconstruct
- poly = vreinterpretq_f16_s16(vqaddq_s16(vreinterpretq_s16_f16(poly), vqshlq_n_s16(m, 9)));
- poly = vbslq_f16(vcltq_s16(m, CONST_NEGATIVE_126), CONST_0, poly);
-
- return poly;
+ const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vexpq_f32(x_low)), vexpq_f32(x_high));
+ return res;
}
inline float16x8_t vlogq_f16(float16x8_t x)
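The rewritten vexpq_f16 routes the exponential through FP32: the eight half-precision lanes are widened into two float32x4_t halves, each half is exponentiated with the library's vexpq_f32, and the results are narrowed back with a single rounding step, dropping the FP16 Taylor reconstruction entirely until the approximation is revisited (COMPMID-1535). A minimal standalone sketch of that widen-compute-narrow pattern, assuming an AArch64 target built with FP16 vector support; exp_f32x4 is a hypothetical scalar stand-in for the library's vexpq_f32 polynomial:

#include <arm_neon.h>
#include <cmath>

// Hypothetical stand-in for the library's vexpq_f32 polynomial,
// evaluated lane by lane so the sketch compiles outside the library.
static float32x4_t exp_f32x4(float32x4_t v)
{
    float tmp[4];
    vst1q_f32(tmp, v);
    for(float &t : tmp)
    {
        t = std::exp(t);
    }
    return vld1q_f32(tmp);
}

// Widen-compute-narrow: do the math in FP32, round back to FP16 once.
// Compile with e.g. -march=armv8.2-a+fp16 on AArch64.
float16x8_t exp_f16x8(float16x8_t x)
{
    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));  // lanes 0..3
    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x)); // lanes 4..7
    return vcvt_high_f16_f32(vcvt_f16_f32(exp_f32x4(x_low)), exp_f32x4(x_high));
}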
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 2163f7bb63..5ce79f1007 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -184,7 +184,7 @@ typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivat
Iterator output(_output, window);
static const float16x8_t CONST_0 = vdupq_n_f16(0.f);
- static const float16x4_t CONST_1_H = vdup_n_f16(1.f);
+ static const float16x8_t CONST_1_H = vdupq_n_f16(1.f);
static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f);
@@ -240,24 +240,11 @@ typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivat
break;
case ActivationFunction::LOGISTIC:
{
- // TODO (COMPMID-1535) : Revisit FP16 approximations
- const float16x4x2_t in0 =
- {
- vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_low_f16(in.val[0]))))))),
- vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_high_f16(in.val[0]))))))),
- };
-
- const float16x4x2_t in1 =
- {
- vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_low_f16(in.val[1]))))))),
- vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_high_f16(in.val[1]))))))),
- };
-
tmp =
{
{
- vcombine_f16(in0.val[0], in0.val[1]),
- vcombine_f16(in1.val[0], in1.val[1]),
+ vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))),
+ vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1]))))
}
};
}
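With vexpq_f16 now promoting to FP32 internally, the LOGISTIC case collapses to plain eight-lane F16 vector ops; the per-half vcvt/vcombine plumbing becomes redundant, which is also why CONST_1_H widens from float16x4_t to float16x8_t above. A scalar model of what each lane now evaluates (plain C++, no intrinsics; the function name is illustrative):

#include <cmath>

// Per-lane semantics of vinvq_f16(vaddq_f16(1, vexpq_f16(vnegq_f16(x)))):
// the standard sigmoid, with the exponential itself evaluated in FP32
// inside vexpq_f16 (see the NEMath.inl change above).
float logistic(float x)
{
    return 1.0f / (1.0f + std::exp(-x));
}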
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index b77219cd79..182e93d177 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -154,7 +154,10 @@ struct RedOpX
input);
auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value));
- carry_addition = wrapper::vpadd(carry_addition, carry_addition);
+ for(int i = 0; i < S / 4; ++i)
+ {
+ carry_addition = wrapper::vpadd(carry_addition, carry_addition);
+ }
auto res = wrapper::vgetlane(carry_addition, 0);
if(op == ReductionOperation::MEAN_SUM)
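The single vpadd fully reduced the four FP32 lanes but left the eight FP16 lanes only partially summed, which is the ReduceMean mismatch named in the commit message. After vpadd(high, low) the register holds S/2 partial sums, and each further vpadd(carry, carry) halves that count, so S/4 folds finish the sum for both S = 4 (FP32) and S = 8 (FP16). A scalar model of the fold sequence (the template and names are illustrative, not library code):

#include <cstddef>

// Scalar model of the pairwise-fold reduction over an S-lane vector.
// Example: float v[8] = {1, 2, 3, 4, 5, 6, 7, 8}; reduce(v) == 36.0f.
template <std::size_t S>
float reduce(const float (&lanes)[S])
{
    float carry[S / 2];
    for(std::size_t i = 0; i < S / 2; ++i)
    {
        carry[i] = lanes[2 * i] + lanes[2 * i + 1];     // vpadd(high, low)
    }
    std::size_t live = S / 2;
    for(std::size_t f = 0; f < S / 4; ++f)              // the new fold loop
    {
        for(std::size_t i = 0; i < live / 2; ++i)
        {
            carry[i] = carry[2 * i] + carry[2 * i + 1]; // vpadd(carry, carry)
        }
        live /= 2;
    }
    return carry[0];                                    // vgetlane(..., 0)
}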
diff --git a/tests/validation/NEON/YOLOLayer.cpp b/tests/validation/NEON/YOLOLayer.cpp
index 926a2dad86..6225dc170e 100644
--- a/tests/validation/NEON/YOLOLayer.cpp
+++ b/tests/validation/NEON/YOLOLayer.cpp
@@ -46,7 +46,7 @@ namespace
/** Tolerance */
constexpr AbsoluteTolerance<float> tolerance_f32(1e-6f);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr RelativeTolerance<float> tolerance_f16(0.001f);
+constexpr RelativeTolerance<float> tolerance_f16(0.01f);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
/** Floating point data sets. */
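The looser F16 tolerance matches half-precision limits: with a 10-bit mantissa, round-to-nearest alone can contribute up to 2^-11 ≈ 4.9e-4 relative error, half of the old 0.001 bound, leaving almost no headroom for the error of the vectorised exponential even though it is now computed in FP32 and rounded back once. A small check of that rounding floor, assuming a compiler with the __fp16 extension (e.g. GCC or Clang on AArch64):

#include <cmath>
#include <cstdio>

int main()
{
    // Double-precision sigmoid reference vs the same value rounded to FP16.
    const double x   = 4.0;
    const double ref = 1.0 / (1.0 + std::exp(-x));
    const __fp16 h   = static_cast<__fp16>(ref); // one rounding to half precision
    const double rel = std::fabs(static_cast<double>(h) - ref) / ref;
    // rel can reach ~4.9e-4 from rounding alone, so 0.001 left little margin
    // for the exp approximation; 0.01 does.
    std::printf("relative error: %g\n", rel);
    return 0;
}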