From d5e65c71261fd42d3e69478507fbfcc8cf36befc Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 26 Jul 2017 17:09:17 +0100 Subject: COMPMID-456: Add support for QS16 NEON Normalization Layer. Change-Id: I1e542808cfd7774c67cc4e9a58e42449e4fb29aa Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81735 Tested-by: Kaizen Reviewed-by: Anthony Barbier --- arm_compute/core/NEON/NEFixedPoint.h | 25 ++++++++++++++++++++-- arm_compute/core/NEON/NEFixedPoint.inl | 17 +++++++++++++++ .../core/NEON/kernels/NENormalizationLayerKernel.h | 4 ++-- 3 files changed, 42 insertions(+), 4 deletions(-) (limited to 'arm_compute/core') diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index 50463b5efe..08f680801d 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -235,13 +235,22 @@ qint8x16_t vdupq_n_qs8(qint8_t a); /** Duplicate a float and convert it to 8 bit fixed point vector (16 elements) * - * @param[in] a 8 bit fixed point to duplicate + * @param[in] a floating point value to convert and duplicate * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number * * @return The result of the vector duplication */ qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position); +/** Duplicate a float and convert it to 16 bit fixed point vector (8 elements) + * + * @param[in] a floating point value to convert and duplicate + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the vector duplication + */ +qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position); + /** 16 bit fixed point vector duplicate (8 elements) * * @param[in] a 16 bit fixed point to duplicate @@ -1178,7 +1187,19 @@ qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position); * * @return The result of the 8bit power. 
*/ -qint8x8_t vqpowq_qs8(qint8x8_t a, qint8x16_t b, int fixed_point_position); +qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); + +/** Calculate saturating n power for fixed point 16bit (8 elements). + * + * pow(a,b) = e^(b*log(a)) + * + * @param[in] a 16bit fixed point input vector + * @param[in] b 16bit fixed point power vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16bit power. + */ +qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); /** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements * diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl index 7cebfad924..c879d3e275 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -250,6 +250,18 @@ inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position) return vqcvtq_qs8_f32(res, fixed_point_position); } +inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position) +{ + float32x4x2_t res = + { + { + vdupq_n_f32(a), + vdupq_n_f32(a), + } + }; + return vqcvtq_qs16_f32(res, fixed_point_position); +} + inline qint16x8_t vdupq_n_qs16(qint16_t a) { return vdupq_n_s16(a); @@ -1941,6 +1953,11 @@ inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_positio return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position); } +inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position) +{ + return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position); +} + inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b) { float32x4x2_t res = diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h 
index b1bc594e4c..e24e481f46 100644 --- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h @@ -50,7 +50,7 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/F32. + * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], * Data type supported: same as @p input * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input @@ -86,7 +86,7 @@ private: * * @param[in] window Region on which to execute the kernel. */ - template <bool do_2D_norm> + template <DataType dt, bool do_2D_norm> void normalize_fixed_point(const Window &window); /** Common signature for all the specialised normalization functions * -- cgit v1.2.1