From 58c5794b917dae10ff115dd85ec69e2ca41136c1 Mon Sep 17 00:00:00 2001
From: Gian Marco <gianmarco.iodice@arm.com>
Date: Tue, 28 Nov 2017 09:10:03 +0000
Subject: COMPMID-706 - Add GEMMLowp output stage for scaling by a fixed point
 number

DoD:
- Implement NEON kernel for quantizing down the gemmlowp result. The
  result should be scaled by a fixedpoint number
- Implement OpenCL kernel for quantizing down the gemmlowp result. The
  result should be scaled by a fixedpoint number
- Add test for validating the result

Required for:
- Integration of GEMMLowp in Android NN
- Convolution quantized
- Fully connected quantized

Change-Id: Ia963d25d695471e963961fb49a5600e78374ac4f
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110981
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
---
 src/core/CL/cl_kernels/gemmlowp.cl | 89 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 3 deletions(-)

(limited to 'src/core/CL/cl_kernels')

diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 16f8fe9f7f..a8e8e600fe 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -508,6 +508,9 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
 
     int16 input_values = vload16(0, (__global int *)src.ptr);
 
+    // Add the offset terms to GEMM's result
+    input_values += (int16)RESULT_OFFSET;
+
 #if defined(ADD_BIAS)
     // Add bias
     const int16 biases_values = vload16(0, (__global int *)biases.ptr);
@@ -515,10 +518,90 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
 #endif // defined(ADD_BIAS)
 
     // Multiply by result_mult_int and shift
-    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_MULT_INT, RESULT_SHIFT, 16);
+    input_values *= RESULT_MULT_INT;
+
+    input_values >>= RESULT_SHIFT;
+
+    uchar16 res = convert_uchar16_sat(input_values);
+
+#if defined(MIN_BOUND)
+    res = max(res, (uchar16)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (uchar16)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore16(res, 0, dst.ptr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+
+#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                             VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                             TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#if defined(ADD_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+#endif // defined(ADD_BIAS)
+
+    int16 input_values = vload16(0, (__global int *)src.ptr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
+    input_values += (int16)biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_mult_int and shift
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);
 
     // Add the offset terms to GEMM's result
-    input_values += (int16)RESULT_OFFSET;
+    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;
 
     uchar16 res = convert_uchar16_sat(input_values);
 
@@ -532,4 +615,4 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
     // Store the result
     vstore16(res, 0, dst.ptr);
 }
-#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
\ No newline at end of file
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
\ No newline at end of file
-- 
cgit v1.2.1