From 671d4f01d96b62a24cf0688059118a1e7908650e Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Wed, 14 Oct 2020 12:26:51 +0100
Subject: COMPMID-3724: Remove OpenCL padding: CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel

COMPMID-3725: Remove OpenCL padding: CLGEMMLowpQuantizeDownInt32ScaleKernel

Change-Id: Idea5974a56861efae3bc255f1224c7f1e88f3650
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4182
Tested-by: Arm Jenkins
Reviewed-by: Giorgio Arena
Comments-Addressed: Arm Jenkins
---
 src/core/CL/cl_kernels/gemmlowp.cl | 98 ++++++++++++++++++++++----------------
 1 file changed, 56 insertions(+), 42 deletions(-)

diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 29314ec581..c962d3c20e 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1896,6 +1896,7 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC
  * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
  * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
  *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
  *
  * @param[in] src_ptr      Pointer to the source tensor. Supported data type: S32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -1925,7 +1926,7 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
                                                   TENSOR3D_DECLARATION(dst))
 {
     // Compute source and destination addresses
-    int x = get_global_id(0) * 4;
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
     int y = get_global_id(1);
     int z = get_global_id(2);
 
@@ -1933,18 +1934,20 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
 
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
 
-    int4 input_values = vload4(0, (__global int *)src_addr);
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
 
 #if defined(ADD_BIAS)
     // Add bias
     __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
 
-    int4 biases_values = vload4(0, (__global int *)bias_addr);
-    input_values += (int4)biases_values;
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
 #endif // defined(ADD_BIAS)
 
     // Add the offset terms to GEMM's result
-    input_values += (int4)RESULT_OFFSET;
+    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;
 
     // Multiply by result_mult_int and shift
     input_values *= RESULT_MULT_INT;
@@ -1955,18 +1958,18 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
     input_values >>= RESULT_SHIFT;
 #endif // RESULT_SHIFT < 0
 
-    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
-    res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
 
 #if defined(MIN_BOUND)
-    res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
 #endif // defined(MIN_BOUND)
 #if defined(MAX_BOUND)
-    res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
 #endif // defined(MAX_BOUND)
 
     // Store the result
-    vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 }
 #endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
 
@@ -1991,7 +1994,8 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
  * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
  * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
  *       These values can be used to implement "rectified linear unit" activation functions
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
  *
  * @param[in] src_ptr      Pointer to the source tensor. Supported data type: S32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -2021,7 +2025,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
                                                              TENSOR3D_DECLARATION(dst))
 {
     // Compute source and destination addresses
-    int x = max((int)(get_global_id(0) * 4 - (4 - VEC_SIZE_LEFTOVER) % 4), 0);
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
     int y = get_global_id(1);
     int z = get_global_id(2);
 
@@ -2029,38 +2033,40 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
 
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
 
-    int4 input_values = vload4(0, (__global int *)src_addr);
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
 
 #if defined(ADD_BIAS)
     // Add bias
     __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
 
-    int4 biases_values = vload4(0, (__global int *)bias_addr);
-    input_values += (int4)biases_values;
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
 #endif // defined(ADD_BIAS)
 
     // Multiply by result_mult_int and shift
 #if RESULT_SHIFT < 0
-    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
 #else // RESULT_SHIFT >= 0
-    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
 #endif // RESULT_SHIFT < 0
 
     // Add the offset terms to GEMM's result
-    input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
+    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;
 
-    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
-    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
 
 #if defined(MIN_BOUND)
-    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
 #endif // defined(MIN_BOUND)
 #if defined(MAX_BOUND)
-    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
 #endif // defined(MAX_BOUND)
 
     // Store the result
-    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, 4, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 }
 #endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
 
@@ -2083,7 +2089,8 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
  * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
  * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
  *       These values can be used to implement "rectified linear unit" activation functions
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
  *
  * @param[in] src_ptr      Pointer to the source tensor. Supported data type: S32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -2113,7 +2120,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
                                                                      TENSOR3D_DECLARATION(dst))
 {
     // Compute source and destination addresses
-    int x = max((int)(get_global_id(0) * 4 - (4 - VEC_SIZE_LEFTOVER) % 4), 0);
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
     int y = get_global_id(1);
     int z = get_global_id(2);
 
@@ -2121,34 +2128,37 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
 
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z;
 
-    int4 input_values = vload4(0, (__global int *)src_addr);
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
 
 #if defined(ADD_BIAS)
     // Add bias
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
 
-    int4 biases_values = vload4(0, (__global int *)bias_addr);
-    input_values += (int4)biases_values;
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
 #endif // defined(ADD_BIAS)
 
     // Multiply by result_mult_int and shift
 #if RESULT_SHIFT < 0
-    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
 #else // RESULT_SHIFT >= 0
-    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
 #endif // RESULT_SHIFT < 0
 
-    short4 res0 = convert_short4_sat(input_values);
+    VEC_DATA_TYPE(short, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE));
 
 #if defined(MIN_BOUND)
-    res0 = max(res0, (short4)MIN_BOUND);
+    res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
 #endif // defined(MIN_BOUND)
 #if defined(MAX_BOUND)
-    res0 = min(res0, (short4)MAX_BOUND);
+    res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
 #endif // defined(MAX_BOUND)
 
     // Store the result
-    STORE_VECTOR_SELECT(res, short, dst_addr, 4, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+    STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 }
 #endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
 
@@ -2173,6 +2183,8 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
  * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
  * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
  *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
  *
  * @param[in] src_ptr      Pointer to the source tensor. Supported data type: S32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -2208,7 +2220,7 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
 #endif // defined(DST_HEIGHT)
 {
     // Compute source and destination addresses
-    int x = get_global_id(0) * 4;
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
     int y = get_global_id(1);
     int z = get_global_id(2);
 
@@ -2216,13 +2228,15 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
 
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
 
-    int4 input_values = vload4(0, (__global int *)src_addr);
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
 
 #if defined(ADD_BIAS)
     // Add bias
     __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
 
-    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
     input_values += (int4)biases_values;
 #endif // defined(ADD_BIAS)
 
@@ -2230,17 +2244,17 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
     float4 input_values_f = convert_float4(input_values);
     input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
 
-    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
-    res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
 
 #if defined(MIN_BOUND)
-    res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
 #endif // defined(MIN_BOUND)
 #if defined(MAX_BOUND)
-    res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
 #endif // defined(MAX_BOUND)
 
     // Store the result
-    vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 }
 #endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
--
cgit v1.2.1
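
Note on the indexing scheme (illustration, not part of the patch): the updated kernels cover a row whose width is not a multiple of VEC_SIZE without any output padding. The standalone C sketch below mirrors the per-work-item arithmetic from the diff; the width of 35, the VEC_SIZE of 16, the assumption that the host launches ceil(width / VEC_SIZE) work-items in X, and the modelling of STORE_VECTOR_SELECT as "store n elements starting at x" are all hypothetical values chosen for the example.

#include <stdio.h>

/* Illustrative only: mirrors the per-work-item index arithmetic of the
 * patched kernels. The width, VEC_SIZE and the derived VEC_SIZE_LEFTOVER
 * are example values, not taken from the library. */
int main(void)
{
    const int width             = 35;               /* first dimension of the tensor            */
    const int vec_size          = 16;               /* -DVEC_SIZE                               */
    const int vec_size_leftover = width % vec_size; /* -DVEC_SIZE_LEFTOVER = 35 % 16 = 3        */
    const int num_work_items    = (width + vec_size - 1) / vec_size; /* assumed ceil(35/16) = 3 */

    for(int gid = 0; gid < num_work_items; ++gid)
    {
        /* Same expression as "int x = max(...)" in the kernels */
        int x = gid * vec_size - (vec_size - vec_size_leftover) % vec_size;
        x     = x > 0 ? x : 0;

        /* STORE_VECTOR_SELECT is modelled here as: store only the leftover
         * elements for the first work-item (when a leftover exists), and a
         * full vector for every other work-item. */
        int stored = (vec_size_leftover != 0 && gid == 0) ? vec_size_leftover : vec_size;

        printf("gid=%d stores %2d elements at x=[%2d..%2d]\n", gid, stored, x, x + stored - 1);
    }
    return 0;
}

Running the sketch prints gid 0 writing elements [0..2], gid 1 writing [3..18] and gid 2 writing [19..34]: the leftover is absorbed by the first work-item and every later work-item is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER), so each element is written exactly once and no store reaches past the end of the row, which is why the destination no longer needs padding in X.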
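
Relatedly, the new @note lines document two compile-time options, -DVEC_SIZE and -DVEC_SIZE_LEFTOVER. A minimal sketch of how a host could derive them when building the program is given below; the function name, the preferred vector width of 16 and the clamping for narrow outputs are assumptions for illustration, not the library's actual kernel-configuration code.

#include <stdio.h>

/* Hypothetical host-side sketch: derive the build options described in the
 * kernel documentation. Names and the preferred width are illustrative. */
static void gemmlowp_output_stage_build_options(int output_width, char *opts, size_t opts_len)
{
    const int vec_size          = output_width < 16 ? output_width : 16; /* assumed preferred width   */
    const int vec_size_leftover = output_width % vec_size;               /* remainder for work-item 0 */

    snprintf(opts, opts_len, "-DVEC_SIZE=%d -DVEC_SIZE_LEFTOVER=%d", vec_size, vec_size_leftover);
}

int main(void)
{
    char opts[64];
    gemmlowp_output_stage_build_options(35, opts, sizeof(opts));
    printf("%s\n", opts); /* prints: -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=3 */
    return 0;
}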