From 944d3f79baef6878916c1ec082a71768f0bf3409 Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Tue, 16 Jan 2018 15:38:35 +0000
Subject: COMPMID-751 Processing 8 elements makes computation up to 80us faster
 on MobileNet QASYMM8 dwc layers

Change-Id: I30eaea3f3625086e311ad201ef73a8f06a01e382
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116521
Tested-by: Jenkins
Reviewed-by: Georgios Pinitas
---
 .../cl_kernels/depthwise_convolution_quantized.cl | 65 +++++++++++-----------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 0cd4e7148e..8a757fc2bd 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,22 +45,23 @@
  * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
  * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
  *
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
  */
-inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_1(__global const uchar *left_pixel,
                                     const int             left_coeff,
                                     const int             middle_coeff,
                                     const int             right_coeff,
                                     const int             input_offset,
                                     const int             weight_offset)
 {
-    int4 temp = CONVERT(vload4(0, left_pixel), int4);
+    int8 temp0 = CONVERT(vload8(0, left_pixel), int8);
+    int2 temp1 = CONVERT(vload2(0, (left_pixel + 8 * sizeof(uchar))), int2);

-    int2 left   = CONVERT(temp.s01, int2);
-    int2 middle = CONVERT(temp.s12, int2);
-    int2 right  = CONVERT(temp.s23, int2);
+    int8 left   = CONVERT(temp0.s01234567, int8);
+    int8 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8);
+    int8 right  = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8);

-    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
 }

 /** Compute a 1D horizontal convolution of size 3 and stride 2 for uchar type.
@@ -72,23 +73,23 @@ inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
  * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
  * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
  *
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
  */
-inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_2(__global const uchar *left_pixel,
                                     const int             left_coeff,
                                     const int             middle_coeff,
                                     const int             right_coeff,
                                     const int             input_offset,
                                     const int             weight_offset)
 {
-    int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
-    int  temp1 = CONVERT(*(left_pixel + 4 * sizeof(uchar)), int);
+    int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+    int   temp1 = CONVERT(*(left_pixel + 16 * sizeof(uchar)), int);

-    int2 left   = CONVERT(temp0.s02, int2);
-    int2 middle = CONVERT(temp0.s13, int2);
-    int2 right  = CONVERT((int2)(temp0.s2, temp1), int2);
+    int8 left   = CONVERT(temp0.s02468ace, int8);
+    int8 middle = CONVERT(temp0.s13579bdf, int8);
+    int8 right  = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8);

-    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
 }

 /** Compute a 1D horizontal convolution of size 3 and stride 3 for uchar type.
@@ -100,23 +101,23 @@ inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
  * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
  * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
  *
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
  */
-inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_3(__global const uchar *left_pixel,
                                     const int             left_coeff,
                                     const int             middle_coeff,
                                     const int             right_coeff,
                                     const int             input_offset,
                                     const int             weight_offset)
 {
-    int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
-    int2 temp1 = CONVERT(vload2(0, (left_pixel + 4 * sizeof(uchar))), int2);
+    int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+    int8  temp1 = CONVERT(vload8(0, (left_pixel + 16 * sizeof(uchar))), int8);

-    int2 left   = CONVERT(temp0.s03, int2);
-    int2 middle = CONVERT((int2)(temp0.s1, temp1.s0), int2);
-    int2 right  = CONVERT((int2)(temp0.s2, temp1.s1), int2);
+    int8 left   = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);
+    int8 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8);
+    int8 right  = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8);

-    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
 }

 /** Apply a 3x3 convolution matrix to a single channel QASYMM8 input image and return the result.
@@ -144,9 +145,9 @@ inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
  * @param[in] output_shift      Output scale divisor exponent
  * @param[in] bias              (Optional) Bias value
  *
- * @return a uchar2 containing 2 convoluted values.
+ * @return a uchar8 containing 8 convoluted values.
  */
-inline uchar2 convolution3x3(
+inline uchar8 convolution3x3(
     Image      *src,
     const uchar mat0, const uchar mat1, const uchar mat2,
     const uchar mat3, const uchar mat4, const uchar mat5,
@@ -159,20 +160,20 @@ inline uchar2 convolution3x3(
 #endif //defined(HAS_BIAS)
 )
 {
-    int2 pixels;
+    int8 pixels;

     pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2, input_offset, weight_offset);
     pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5, input_offset, weight_offset);
     pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8, input_offset, weight_offset);

 #if defined(HAS_BIAS)
-    pixels += (int2)(bias);
+    pixels += (int8)(bias);
 #endif //defined(HAS_BIAS)

-    pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 2);
+    pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 8);
     pixels = pixels + output_offset;
     pixels = clamp(pixels, 0, 255);

-    return CONVERT(pixels, uchar2);
+    return CONVERT(pixels, uchar8);
 }

 /** This function computes the horizontal integral of the image.
@@ -241,7 +242,7 @@ __kernel void depthwise_convolution_3x3_quantized(
     int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
 #endif //defined(HAS_BIAS)

-    uchar2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+    uchar8 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                    weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                    weights_values2.s0, weights_values2.s1, weights_values2.s2,
                                    input_offset, weight_offset, output_offset,
@@ -252,7 +253,7 @@
 #endif //defined(HAS_BIAS)
     );

-    vstore2(pixels, 0, dst.ptr);
+    vstore8(pixels, 0, dst.ptr);
 }

 #endif //defined(CONV_STRIDE_X)
--
cgit v1.2.1
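
The speed-up comes from widening each work-item from 2 outputs to 8. For stride 1, producing
8 outputs of a 1x3 window needs 10 consecutive input pixels, which the kernel now fetches with
one vload8 plus one vload2 and reassembles through the swizzles above; left, middle and right
are the same 8-wide window shifted by 0, 1 and 2 pixels. The standalone C sketch below mirrors
that gather pattern and the quantized accumulation. It is illustrative only: the offsets and
the float output scale are placeholder values, not anything from the library, and the float
requantization stands in for the kernel's fixed-point
ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE helper.

    #include <stdio.h>

    /* Illustrative quantization parameters -- placeholders, not library values. */
    #define INPUT_OFFSET  (-3)
    #define WEIGHT_OFFSET 5
    #define OUTPUT_OFFSET 10
    #define OUTPUT_SCALE  0.05f /* stands in for the fixed-point multiplier/shift */

    static unsigned char clamp_u8(float v)
    {
        return (unsigned char)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
    }

    int main(void)
    {
        /* 10 input pixels feed 8 stride-1 outputs: an 8-wide load covers
         * indices 0..7 (temp0), a 2-wide load covers 8..9 (temp1). */
        const unsigned char row[10] = { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 };
        const int w0 = 2, w1 = -1, w2 = 3; /* one 1x3 filter row */

        int temp0[8], temp1[2];
        for (int i = 0; i < 8; ++i) temp0[i] = row[i];
        for (int i = 0; i < 2; ++i) temp1[i] = row[8 + i];

        for (int i = 0; i < 8; ++i)
        {
            /* left   = temp0.s01234567                      (window shift 0)
             * middle = (temp0.s1234, temp0.s567, temp1.s0)  (window shift 1)
             * right  = (temp0.s2345, temp0.s67,  temp1.s01) (window shift 2) */
            int left   = temp0[i];
            int middle = (i + 1 < 8) ? temp0[i + 1] : temp1[i - 7];
            int right  = (i + 2 < 8) ? temp0[i + 2] : temp1[i - 6];

            /* Quantized multiply-accumulate, as in convolution1x3_stride_1. */
            int acc = (left   + INPUT_OFFSET) * (w0 + WEIGHT_OFFSET)
                    + (middle + INPUT_OFFSET) * (w1 + WEIGHT_OFFSET)
                    + (right  + INPUT_OFFSET) * (w2 + WEIGHT_OFFSET);

            /* Simplified requantization: scale, add offset, clamp to [0, 255]. */
            printf("out[%d] = %u\n", i, (unsigned)clamp_u8(acc * OUTPUT_SCALE + OUTPUT_OFFSET));
        }
        return 0;
    }

With stride 2 the same pattern needs 17 input pixels (one vload16 plus a scalar load), and with
stride 3 it needs 24 (vload16 plus vload8), which is where the longer swizzles such as
temp0.s02468ace in the patch come from.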