From dbdea0d1c025b18d4d82c278c87454427918f5b4 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Wed, 16 Oct 2019 19:21:40 +0100
Subject: COMPMID-2308: NEConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL
 filters

Change-Id: Ic1bf5f0d21ccd525f84213a360f7e199d7f50577
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2177
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/NEON/NEAsymm.h | 99 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

(limited to 'arm_compute/core/NEON/NEAsymm.h')
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 56d4c09f92..a3bd7e28f0 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -115,6 +115,66 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
     return out_u8;
 }
 
+/** Performs the final quantization step on 16 S32 values, producing 16 S8 values (symmetric quantization)
+ *
+ * @tparam is_bounded_relu When true, the narrowed result is additionally clamped to [min_s8, max_s8]
+ *
+ * @param[in, out] in_s32                        Values to be quantized; overwritten in place with the rescaled S32 intermediates.
+ * @param[in]      result_fixedpoint_multiplier  Fixed-point multipliers, one per element (applied lane-wise)
+ * @param[in]      result_shift                  Shifts, one per element, passed to rounding_divide_by_pow2
+ * @param[in]      result_offset_after_shift_s32 Offset vector added to each of the four S32 vectors after the shift
+ * @param[in]      min_s8                        Relu lower bound (only read when is_bounded_relu is true)
+ * @param[in]      max_s8                        Relu upper bound (only read when is_bounded_relu is true)
+ *
+ * @return The 16 quantized S8 values, in the order val[0], val[1], val[2], val[3] of the input
+ */
+template <bool   is_bounded_relu>
+inline int8x16_t finalize_quantization_symm(int32x4x4_t       &in_s32,
+                                            const int32x4x4_t &result_fixedpoint_multiplier,
+                                            const int32x4x4_t &result_shift,
+                                            const int32x4_t   &result_offset_after_shift_s32,
+                                            const int8x16_t   &min_s8,
+                                            const int8x16_t   &max_s8)
+{
+    // Fixed point multiplication: saturating rounding doubling multiply high, lane-wise against the per-element multipliers
+    in_s32.val[0] = vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]);
+    in_s32.val[1] = vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]);
+    in_s32.val[2] = vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]);
+    in_s32.val[3] = vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]);
+
+    // Round to the nearest division by a power-of-two using the per-element result_shift
+    in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift.val[0]);
+    in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift.val[1]);
+    in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift.val[2]);
+    in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift.val[3]);
+
+    // Add the same offset vector to each of the four S32 vectors
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
+
+    // Narrow S32 to S16 with saturation (vqmovn), keeping the element order
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Narrow S16 to S8 with saturation (vqmovn)
+    int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+    if(is_bounded_relu) // template constant: the clamp below is applied only when requested
+    {
+        out_s8 = vmaxq_s8(out_s8, min_s8);
+        out_s8 = vminq_s8(out_s8, max_s8);
+    }
+
+    return out_s8;
+}
+
 /** Performs final quantization step on single element
  *
  * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
@@ -154,6 +214,45 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
     return out_u8;
 }
 
+/** Performs the final quantization step on a single element, producing a signed 8-bit value
+ *
+ * @tparam is_bounded_relu When true, the result is additionally clamped to [min_s8, max_s8]
+ *
+ * @param[in] in_value                      Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Fixed-point multiplier applied with vqrdmulhq_n_s32
+ * @param[in] result_shift                  Shift passed to rounding_divide_by_pow2
+ * @param[in] result_offset_after_shift_s32 Offset added after the shift
+ * @param[in] min_s8                        Relu lower bound (only read when is_bounded_relu is true)
+ * @param[in] max_s8                        Relu upper bound (only read when is_bounded_relu is true)
+ *
+ * @return Quantized value, saturated to [-128, 127]
+ */
+template <bool is_bounded_relu>
+inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
+                                    int32_t result_shift, int32_t result_offset_after_shift_s32,
+                                    int8_t min_s8, int8_t max_s8)
+{
+    int32x4_t in_s32 = vdupq_n_s32(in_value); // broadcast the scalar so the vector intrinsic below can be used
+
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar; only lane 0 is read back
+    in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
+
+    // Round to the nearest division by a power-of-two using result_shift
+    in_value = rounding_divide_by_pow2(in_value, result_shift);
+
+    // Add the offset term
+    in_value += result_offset_after_shift_s32;
+
+    // Saturate to the S8 range [-128, 127] before narrowing to int8_t
+    int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
+    if(is_bounded_relu) // template constant: the clamp below is applied only when requested
+    {
+        out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
+    }
+
+    return out_s8;
+}
+
 /** Dequantize a neon vector holding 8 quantized values.
  *
  * @param[in] qv Input values to be dequantized.
-- 
cgit v1.2.1