From 9247c92bd8c53be4d0c4ae931f51ca8f88e4150b Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Wed, 28 Jun 2017 18:29:47 +0100
Subject: COMPMID-428: Port NESoftmaxLayer to 16-bit fixed point.

Change-Id: I65122950bab9124b9758c27096c0f458b77aeabb
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79365
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Steven Niu <steven.niu@arm.com>
---
 arm_compute/core/NEON/NEFixedPoint.h               |  1 +
 arm_compute/core/NEON/NEFixedPoint.inl             | 64 +++++++++++++++++++++-
 .../core/NEON/kernels/NESoftmaxLayerKernel.h       |  6 +-
 3 files changed, 66 insertions(+), 5 deletions(-)

(limited to 'arm_compute/core/NEON')

diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index e30509cd0a..09579f9120 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -46,6 +46,7 @@ using qint16x8_t   = int16x8_t;   /**< 16 bit fixed point vector with 8 elements
 using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
 using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
 using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
+using qint32x2_t   = int32x2_t;   /**< 32 bit fixed point vector with 2 elements */
 using qint32x4_t   = int32x4_t;   /**< 32 bit fixed point vector with 4 elements */
 
 /** Get the lower half of a 16 elements vector
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index b241dd5069..f62a338a61 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -384,6 +384,11 @@ inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
     return vqadd_s16(a, b);
 }
 
+inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
+{
+    return vqadd_s32(a, b); // Forwards to the NEON saturating add vqadd_s32 on the two 32-bit lanes
+}
+
 inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
 {
     return vqaddq_s8(a, b);
@@ -394,6 +399,11 @@ inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
     return vqaddq_s16(a, b);
 }
 
+inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
+{
+    return vqaddq_s32(a, b); // Forwards to the NEON saturating add vqaddq_s32 on the four 32-bit lanes
+}
+
 inline int16x4_t vpaddl_qs8(qint8x8_t a)
 {
     return vpaddl_s8(a);
@@ -1073,6 +1083,56 @@ inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
     return vshl_s16(x, shift_value);
 }
 
+inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position) // Per-lane reciprocal of a, built from saturating (vq*) operations
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
+    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
+    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position); // 1.0 in the selected format
+
+    // Find shift value: clz(a) + fixed_point_position - 8 per lane; applied to a here and to the estimate at the end
+    const qint8x8_t shift_value = vqneg_s8(vsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+    const qint8x8_t temp        = vqshl_s8(a, shift_value);
+
+    qint8x8_t x = vqadd_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position)); // Initial estimate. NOTE(review): the usual seed is 48/17 - 32/17 * temp (a subtraction) - confirm the add is intended
+
+    uint8x8_t set_one = vcgt_s8(x, const_one); // Mask of lanes whose estimate is greater than 1.0
+    x                 = vbsl_s8(set_one, const_one, x); // Masked lanes are replaced by 1.0
+
+    // Use three iterations of the Newton-Raphson method, x = x + x * (1 - temp * x), to get the result
+    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vqshl_s8(x, shift_value); // Apply the same shift that was applied to a to the estimate for temp
+}
+
+inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position) // Per-lane saturating reciprocal of a 16-bit fixed point vector
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
+    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
+    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
+    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position); // 1.0 in the selected format
+
+    // Find shift value: clz(a) + fixed_point_position - 16 puts the leading bit of a 16-bit lane just below 1.0, i.e. temp in [0.5, 1)
+    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
+    const qint16x4_t temp        = vqshl_s16(a, shift_value);
+
+    qint16x4_t x = vqadd_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position)); // Initial estimate. NOTE(review): the usual seed is 48/17 - 32/17 * temp (a subtraction) - confirm the add is intended
+
+    uint16x4_t set_one = vcgt_s16(x, const_one); // Mask of lanes whose estimate is greater than 1.0
+    x                  = vbsl_s16(set_one, const_one, x); // Masked lanes are replaced by 1.0
+
+    // Use five iterations of the Newton-Raphson method, x = x + x * (1 - temp * x); saturating multiplies throughout, as in vqrecip_qs8
+    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vqshl_s16(x, shift_value); // 1 / a = (1 / temp) shifted by the same amount that normalised a
+}
+
 inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
 {
     // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
@@ -1817,7 +1877,7 @@ inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
     qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
     qint8x8_t num   = vqsub_qs8(exp2x, const_one);
     qint8x8_t den   = vqadd_qs8(exp2x, const_one);
-    qint8x8_t tanh  = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
+    qint8x8_t tanh  = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
 
     return tanh;
 }
@@ -1830,7 +1890,7 @@ inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
     qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
     qint16x4_t num   = vqsub_qs16(exp2x, const_one);
     qint16x4_t den   = vqadd_qs16(exp2x, const_one);
-    qint16x4_t tanh  = vqmul_qs16(num, vrecip_qs16(den, fixed_point_position), fixed_point_position);
+    qint16x4_t tanh  = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
 
     return tanh;
 }
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
index ab626ad5ec..53eef8d665 100644
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -39,7 +39,7 @@ public:
     NELogits1DMaxKernel();
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8, F32.
+     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F32.
      * @param[out] output Destination tensor. Data types supported: same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
@@ -74,7 +74,7 @@ public:
     ~NELogits1DShiftExpSumKernel() = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8, F32.
+     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F32.
      * @param[in]  max    Max values tensor. Data types supported: same as @p input.
      * @param[out] output Destination tensor. Data types supported: same as @p input.
      * @param[out] sum    Sum of 1D logits tensor. Data types supported: same as @p input.
@@ -113,7 +113,7 @@ public:
     ~NELogits1DNormKernel() = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8, F32.
+     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F32.
      * @param[in]  sum    Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input.
      * @param[out] output Destination tensor. Data types supported: same as @p input.
      */
-- 
cgit v1.2.1