From df24618b53cffed1c574e11e9fd4ba7740f8c009 Mon Sep 17 00:00:00 2001
From: Pablo Tello
Date: Mon, 3 Jul 2017 16:25:09 +0100
Subject: COMPMID-421: Added FP16 support to NENormalizationLayer and
 NEPixelWiseMultiplication.

Change-Id: If174f8071502fc5cc94b27cd44a9b1d5e451a9e2
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79553
Tested-by: Kaizen
Reviewed-by: Georgios Pinitas
---
 arm_compute/core/NEON/NEMath.inl | 98 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

(limited to 'arm_compute/core/NEON/NEMath.inl')

diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 9a49493cf6..c73c54501f 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -141,4 +141,100 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
 {
     return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
 }
-}
\ No newline at end of file
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/* Exponent polynomial coefficients */
+const std::array<float16x8_t, 8> exp_tab_f16 =
+{
+    {
+        vdupq_n_f16(1.f),
+        vdupq_n_f16(0.0416598916054f),
+        vdupq_n_f16(0.500000596046f),
+        vdupq_n_f16(0.0014122662833f),
+        vdupq_n_f16(1.00000011921f),
+        vdupq_n_f16(0.00833693705499f),
+        vdupq_n_f16(0.166665703058f),
+        vdupq_n_f16(0.000195780929062f),
+    }
+};
+
+/* Logarithm polynomial coefficients */
+const std::array<float16x8_t, 8> log_tab_f16 =
+{
+    {
+        vdupq_n_f16(-2.29561495781f),
+        vdupq_n_f16(-2.47071170807f),
+        vdupq_n_f16(-5.68692588806f),
+        vdupq_n_f16(-0.165253549814f),
+        vdupq_n_f16(5.17591238022f),
+        vdupq_n_f16(0.844007015228f),
+        vdupq_n_f16(4.58445882797f),
+        vdupq_n_f16(0.0141278216615f),
+    }
+};
+
+inline float16x8_t vinvq_f16(float16x8_t x)
+{
+    float16x8_t recip = vrecpeq_f16(x);
+    recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    return recip;
+}
+
+inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t, 8> &coeffs)
+{
+    const float16x8_t A   = vaddq_f16(coeffs[0], vmulq_f16(coeffs[4], x));
+    const float16x8_t B   = vaddq_f16(coeffs[2], vmulq_f16(coeffs[6], x));
+    const float16x8_t C   = vaddq_f16(coeffs[1], vmulq_f16(coeffs[5], x));
+    const float16x8_t D   = vaddq_f16(coeffs[3], vmulq_f16(coeffs[7], x));
+    const float16x8_t x2  = vmulq_f16(x, x);
+    const float16x8_t x4  = vmulq_f16(x2, x2);
+    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
+    return res;
+}
+
+inline float16x8_t vexpq_f16(float16x8_t x)
+{
+    static const float16x8_t CONST_LN2          = vdupq_n_f16(0.6931471805f); // ln(2)
+    static const float16x8_t CONST_INV_LN2      = vdupq_n_f16(1.4426950408f); // 1/ln(2)
+    static const float16x8_t CONST_0            = vdupq_n_f16(0.f);
+    static const int16x8_t   CONST_NEGATIVE_126 = vdupq_n_s16(-126);
+
+    // Perform range reduction [-log(2),log(2)]
+    const int16x8_t   m   = vcvtq_s16_f16(vmulq_f16(x, CONST_INV_LN2));
+    const float16x8_t val = vsubq_f16(x, vmulq_f16(vcvtq_f16_s16(m), CONST_LN2));
+
+    // Polynomial Approximation
+    float16x8_t poly = vtaylor_polyq_f16(val, exp_tab_f16);
+
+    // Reconstruct
+    poly = vreinterpretq_f16_s16(vqaddq_s16(vreinterpretq_s16_f16(poly), vqshlq_n_s16(m, 9)));
+    poly = vbslq_f16(vcltq_s16(m, CONST_NEGATIVE_126), CONST_0, poly);
+
+    return poly;
+}
+
+inline float16x8_t vlogq_f16(float16x8_t x)
+{
+    static const int16x8_t   CONST_127 = vdupq_n_s16(127);           // 127
+    static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); // ln(2)
+
+    // Extract exponent
+    const int16x8_t   m   = vsubq_s16(vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_f16(x), 9)), CONST_127);
+    const float16x8_t val = vreinterpretq_f16_s16(vsubq_s16(vreinterpretq_s16_f16(x), vshlq_n_s16(m, 9)));
+
+    // Polynomial Approximation
+    float16x8_t poly = vtaylor_polyq_f16(val, log_tab_f16);
+
+    // Reconstruct
+    poly = vaddq_f16(poly, vmulq_f16(vcvtq_f16_s16(m), CONST_LN2));
+
+    return poly;
+}
+
+inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
+{
+    return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
--
cgit v1.2.1
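
The new FP16 routines mirror the FP32 helpers earlier in NEMath.inl: a degree-7 polynomial evaluated after range reduction, guarded by ARM_COMPUTE_ENABLE_FP16. The snippet below is a minimal usage sketch, not part of the patch; it assumes an AArch64 toolchain with ARMv8.2-A half-precision vector arithmetic (e.g. -march=armv8.2-a+fp16), a build with ARM_COMPUTE_ENABLE_FP16 defined, and that NEMath.inl is pulled in through its usual NEMath.h wrapper header on the include path. The file name is hypothetical.

    // fp16_math_example.cpp (hypothetical example, not part of the library)
    #include "arm_compute/core/NEON/NEMath.h"

    #include <arm_neon.h>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Broadcast a test value into all eight half-precision lanes.
        const float16x8_t x = vdupq_n_f16(static_cast<float16_t>(0.5f));

        // Vectorised approximations introduced by this patch.
        const float16x8_t e = arm_compute::vexpq_f16(x); // ~= exp(0.5)
        const float16x8_t l = arm_compute::vlogq_f16(e); // ~= log(exp(0.5)) = 0.5

        // Compare lane 0 against the scalar reference. Only a few decimal
        // digits of agreement are expected, since both the polynomial and the
        // FP16 storage format limit precision.
        std::printf("vexpq_f16(0.5) = %f (reference %f)\n",
                    static_cast<float>(vgetq_lane_f16(e, 0)), std::exp(0.5f));
        std::printf("vlogq_f16(exp(0.5)) = %f (reference %f)\n",
                    static_cast<float>(vgetq_lane_f16(l, 0)), 0.5f);
        return 0;
    }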