From bbd9fb95daa08d6da67c567b40ca2cd032f7a2d3 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Thu, 22 Jun 2017 12:57:51 +0100
Subject: COMPMID-412: Port PoolingLayer to use fixed point 16.

Change-Id: I2005de4c7c14526996309826d33a0ec8e732d2d5
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78720
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Steven Niu <steven.niu@arm.com>
---
 arm_compute/core/FixedPoint.h                      | 18 ++++++++++
 arm_compute/core/FixedPoint.inl                    | 16 +++++++++
 arm_compute/core/NEON/NEFixedPoint.inl             | 40 +++++++++++++---------
 .../core/NEON/kernels/NEPoolingLayerKernel.h       | 16 ++++++++-
 4 files changed, 73 insertions(+), 17 deletions(-)

(limited to 'arm_compute')
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
index f166d93c3e..82c2d3347e 100644
--- a/arm_compute/core/FixedPoint.h
+++ b/arm_compute/core/FixedPoint.h
@@ -40,6 +40,24 @@ using qint64_t = int64_t; /**< 64 bit fixed point scalar value */
  */
 qint8_t sqshl_qs8(qint8_t a, int shift);
 
+/** 8 bit fixed point scalar rounding shift right
+ *
+ * @param[in] a     8 bit fixed point input
+ * @param[in] shift Shift amount (must be greater than zero)
+ *
+ * @return The 8 bit fixed point result of the rounding shift right
+ */
+qint8_t sshr_qs8(qint8_t a, int shift);
+
+/** 16 bit fixed point scalar rounding shift right
+ *
+ * @param[in] a     16 bit fixed point input
+ * @param[in] shift Shift amount (must be greater than zero)
+ *
+ * @return The 16 bit fixed point result of the rounding shift right
+ */
+qint16_t sshr_qs16(qint16_t a, int shift);
+
 /** 16 bit fixed point scalar saturating shift left
  *
  * @param[in] a     First 16 bit fixed point input
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
index b921b32ed9..5ea0f6c825 100644
--- a/arm_compute/core/FixedPoint.inl
+++ b/arm_compute/core/FixedPoint.inl
@@ -21,6 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/Error.h"
+
 #include <cmath>
 #include <limits>
 
@@ -59,6 +61,20 @@ inline qint16_t sqshl_qs16(qint16_t a, int shift)
     return saturate_convert<qint32_t, qint16_t>(tmp);
 }
 
+inline qint8_t sshr_qs8(qint8_t a, int shift) // Rounding right shift of an 8 bit fixed point value by shift bits
+{
+    ARM_COMPUTE_ERROR_ON_MSG(shift <= 0, "Shift should be greater than zero"); // Zero has no rounding term; a negative count makes both shifts below undefined
+    const qint8_t round_val = 1 << (shift - 1); // Half of 2^shift, added before shifting so the result is rounded rather than truncated
+    return sqadd_qs8(a, round_val) >> shift; // NOTE(review): sqadd_qs8 is defined outside this hunk - presumably a saturating add, confirm
+}
+
+inline qint16_t sshr_qs16(qint16_t a, int shift) // Rounding right shift of a 16 bit fixed point value by shift bits
+{
+    ARM_COMPUTE_ERROR_ON_MSG(shift <= 0, "Shift should be greater than zero"); // Zero has no rounding term; a negative count makes both shifts below undefined
+    const qint16_t round_val = 1 << (shift - 1); // Half of 2^shift, added before shifting so the result is rounded rather than truncated
+    return sqadd_qs16(a, round_val) >> shift; // NOTE(review): sqadd_qs16 is defined outside this hunk - presumably a saturating add, confirm
+}
+
 inline qint8_t sabs_qs8(qint8_t a)
 {
     return (a < 0) ? (a == std::numeric_limits<int8_t>::min()) ? std::numeric_limits<int8_t>::max() : -a : a;
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index dd1066d6bc..a5d9e7685d 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -25,8 +25,9 @@
 
 namespace arm_compute
 {
-/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
- *  Format is in Q0.7 for all elements */
+/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
+ *  Format is in Q0.7 for all elements
+ */
 static const std::array<qint8x8_t, 4> exp_tab_qs8 =
 {
     {
@@ -37,8 +38,9 @@ static const std::array<qint8x8_t, 4> exp_tab_qs8 =
     }
 };
 
-/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
- *  Format is in Q0.15 for all elements */
+/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
+ *  Format is in Q0.15 for all elements
+ */
 static const std::array<qint16x4_t, 4> exp_tab_qs16 =
 {
     {
@@ -49,8 +51,9 @@ static const std::array<qint16x4_t, 4> exp_tab_qs16 =
     }
 };
 
-/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
- * Format is in Q0.7 for all elements */
+/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
+ *  Format is in Q0.7 for all elements
+ */
 static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
 {
     {
@@ -61,8 +64,9 @@ static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
     }
 };
 
-/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
- * Format is in Q0.15 for all elements */
+/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
+ *  Format is in Q0.15 for all elements
+ */
 static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
 {
     {
@@ -73,8 +77,9 @@ static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
     }
 };
 
-/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
- * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
+ *  Format is in Q0.7 for all elements except the first one which is in Q1.6
+ */
 static const std::array<qint8x8_t, 4> log_tab_qs8 =
 {
     {
@@ -85,8 +90,9 @@ static const std::array<qint8x8_t, 4> log_tab_qs8 =
     }
 };
 
-/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
- * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
+/** Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
+ *  Format is in Q0.15 for all elements except the first one which is in Q1.14
+ */
 static const std::array<qint16x4_t, 4> log_tab_qs16 =
 {
     {
@@ -97,8 +103,9 @@ static const std::array<qint16x4_t, 4> log_tab_qs16 =
     }
 };
 
-/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
- * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
+ *  Format is in Q0.7 for all elements except the first one which is in Q1.6
+ */
 static const std::array<qint8x16_t, 4> log_tabq_qs8 =
 {
     {
@@ -109,8 +116,9 @@ static const std::array<qint8x16_t, 4> log_tabq_qs8 =
     }
 };
 
-/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
- * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
+/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
+ *  Format is in Q0.15 for all elements except the first one which is in Q1.14
+ */
 static const std::array<qint16x8_t, 4> log_tabq_qs16 =
 {
     {
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index a5de81137b..8a938a7f34 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -48,7 +48,7 @@ public:
     ~NEPoolingLayerKernel() = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. Data types supported: QS8/F16/F32.
+     * @param[in]  input     Source tensor. Data types supported: QS8/QS16/F16/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
@@ -81,6 +81,13 @@ private:
      */
     template <PoolingType pooling_type>
     void pooling2_q8(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling for 16 bit fixed point, specialised on the pooling type via @p pooling_type.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling2_q16(const Window &window_input, const Window &window);
     /** Function to perform 3x3 pooling.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -102,6 +109,13 @@ private:
      */
     template <PoolingType pooling_type>
     void pooling3_q8(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling for 16 bit fixed point, specialised on the pooling type via @p pooling_type.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling3_q16(const Window &window_input, const Window &window);
     /** Function to perform 7x7 pooling.
      *
      * @param[in] window_input Input region on which to execute the kernel.
-- 
cgit v1.2.1