From d66094e37ecd747e85f30130e1a678bdbaf30788 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 15 Apr 2019 15:44:17 +0100
Subject: COMPMID-1995: Fix NEPoolingLayer for quantized 3x3

Quantized 3x3 pooling layer on NEON did not support different
quantization information for the input and output.

Change-Id: I38f8da6ec91c91ba37a21d9d0e1a14fd5bb99f86
Signed-off-by: Georgios Pinitas
Reviewed-on: https://review.mlplatform.org/c/992
Reviewed-by: Isabella Gottardi
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 arm_compute/core/NEON/NEAsymm.h                | 60 +++++++++++++++++++++++---
 src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 33 +++++++++++---
 2 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index f71626182c..253d0fdff7 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -175,10 +175,33 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
     return out_u8;
 }
 
+/** Dequantize a neon vector holding 8 quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize(const uint8x8_t &qv, const QuantizationInfo &qi)
+{
+    const float         scale   = qi.scale;
+    const int           offset  = qi.offset;
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
 /** Dequantize a neon vector holding 16 quantized values.
  *
- * @param qv Input values to be dequantized.
- * @param qi Quantization information to be used in the computation.
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
  *
  * @return Dequantized values in a neon vector
  */
@@ -200,10 +223,38 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const QuantizationInfo &q
     return vdequantized_input;
 }
 
+/** Quantize a neon vector holding 8 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline uint8x8_t vquantize(const float32x4x2_t &qv, const QuantizationInfo &qi)
+{
+    const float       scale     = qi.scale;
+    const int         offset    = qi.offset;
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+}
+
 /** Quantize a neon vector holding 16 floating point values.
  *
- * @param qv Input values to be quantized.
- * @param qi Quantization information to be used in the computation.
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
  *
  * @return A neon vector holding the quantized values
  */
@@ -233,7 +284,6 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const QuantizationInfo &qi)
     const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
     return vcombine_u8(pa, pb);
 }
-
 } // namespace arm_compute
 #include "arm_compute/core/NEON/NEAsymm.inl"
 #endif // __ARM_COMPUTE_NEASYMM_H__
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 308fad5ffb..0b90d9f290 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -813,6 +813,9 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con
     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
+    const QuantizationInfo &input_qinfo  = _input->info()->quantization_info();
+    const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+
     const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
     const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
     const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
@@ -822,6 +825,8 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con
         const auto top_data    = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
         const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
         const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
+        uint8x8_t  fres  = {};
+        uint8x16_t fqres = {};
 
         if(pooling_type == PoolingType::AVG)
         {
@@ -877,7 +882,7 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con
                 scale_vector_s16x8(exclude_padding, res, id, 0, 1,
                                    pool_size, upper_bound_w, upper_bound_h,
                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
+                fres = vmovn_u16(res);
             }
             else
             {
@@ -889,8 +894,7 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con
                 scale_vector_s16x8(exclude_padding, final_sum.val[1], id, 8, 1,
                                    pool_size, upper_bound_w, upper_bound_h,
                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
-                vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+                fqres = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
             }
         }
         else
@@ -904,13 +908,30 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con
             {
                 const uint8x8x2_t      table      = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
                 static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
-                const uint8x8_t        res        = vtbl2_u8(table, lookup_val);
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+                fres = vtbl2_u8(table, lookup_val);
             }
             else
             {
-                vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
+                fqres = final_max;
+            }
+        }
+
+        // Store result
+        if(pool_stride_x == 1)
+        {
+            if(input_qinfo != output_qinfo)
+            {
+                fqres = vquantize(vdequantize(fqres, input_qinfo), output_qinfo);
+            }
+            vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), fqres);
+        }
+        else
+        {
+            if(input_qinfo != output_qinfo)
+            {
+                fres = vquantize(vdequantize(fres, input_qinfo), output_qinfo);
             }
+            vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), fres);
         }
     },
     input, output);
--
cgit v1.2.1
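
For reference, a minimal scalar sketch of the requantization this patch introduces: when the input and output QuantizationInfo differ, each pooled QASYMM8 value is dequantized with the input scale/offset and re-quantized with the output scale/offset, which is what vquantize(vdequantize(v, input_qinfo), output_qinfo) does lane-wise in the kernel. The QInfo struct and requantize() helper below are hypothetical illustrations, not library code, and the rounding used here (round half away from zero) only approximates the NEON paths (round-to-nearest-even on AArch64, truncation otherwise).

// Hypothetical stand-alone illustration; QInfo and requantize() are not ACL APIs.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

struct QInfo
{
    float scale;  // size of one quantization step
    int   offset; // zero point
};

// Requantize one QASYMM8 value from the input quantization space to the output one:
// dequantize with the input scale/offset, then quantize with the output scale/offset.
uint8_t requantize(uint8_t v, const QInfo &in, const QInfo &out)
{
    const float real = (static_cast<int>(v) - in.offset) * in.scale;
    const int   q    = static_cast<int>(std::lround(real / out.scale)) + out.offset;
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255)); // saturate to [0, 255]
}

int main()
{
    const QInfo input_qinfo{ 0.5f, 10 }; // example quantization parameters
    const QInfo output_qinfo{ 0.25f, 5 };
    // (42 - 10) * 0.5 = 16.0 real value; 16.0 / 0.25 + 5 = 69 in the output space
    std::cout << static_cast<int>(requantize(42, input_qinfo, output_qinfo)) << '\n';
    return 0;
}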