aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
diff options
context:
space:
mode:
author: Giorgio Arena <giorgio.arena@arm.com> 2018-01-16 15:38:35 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:43:42 +0000
commit: 944d3f79baef6878916c1ec082a71768f0bf3409 (patch)
tree: 1dc18a46876aedfbe23ec18f9c43dc28ece97d47 /src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
parent: 200b6e38b2f66a87dd0e73b6833554d1cab20b26 (diff)
download: ComputeLibrary-944d3f79baef6878916c1ec082a71768f0bf3409.tar.gz
COMPMID-751 Processing 8 elements makes computation up to 80us faster on MobileNet QASYMM8 dwc layers
Change-Id: I30eaea3f3625086e311ad201ef73a8f06a01e382
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116521
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/depthwise_convolution_quantized.cl')
-rw-r--r-- src/core/CL/cl_kernels/depthwise_convolution_quantized.cl | 65
1 file changed, 33 insertions(+), 32 deletions(-)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 0cd4e7148e..8a757fc2bd 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,22 +45,23 @@
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
*/
-inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_1(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp = CONVERT(vload4(0, left_pixel), int4);
+ int8 temp0 = CONVERT(vload8(0, left_pixel), int8);
+ int2 temp1 = CONVERT(vload2(0, (left_pixel + 8 * sizeof(uchar))), int2);
- int2 left = CONVERT(temp.s01, int2);
- int2 middle = CONVERT(temp.s12, int2);
- int2 right = CONVERT(temp.s23, int2);
+ int8 left = CONVERT(temp0.s01234567, int8);
+ int8 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8);
+ int8 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Compute a 1D horizontal convolution of size 3 and stride 2 for uchar type.
@@ -72,23 +73,23 @@ inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
*/
-inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_2(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
- int temp1 = CONVERT(*(left_pixel + 4 * sizeof(uchar)), int);
+ int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+ int temp1 = CONVERT(*(left_pixel + 16 * sizeof(uchar)), int);
- int2 left = CONVERT(temp0.s02, int2);
- int2 middle = CONVERT(temp0.s13, int2);
- int2 right = CONVERT((int2)(temp0.s2, temp1), int2);
+ int8 left = CONVERT(temp0.s02468ace, int8);
+ int8 middle = CONVERT(temp0.s13579bdf, int8);
+ int8 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Compute a 1D horizontal convolution of size 3 and stride 3 for uchar type.
@@ -100,23 +101,23 @@ inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
*/
-inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_3(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
- int2 temp1 = CONVERT(vload2(0, (left_pixel + 4 * sizeof(uchar))), int2);
+ int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+ int8 temp1 = CONVERT(vload8(0, (left_pixel + 16 * sizeof(uchar))), int8);
- int2 left = CONVERT(temp0.s03, int2);
- int2 middle = CONVERT((int2)(temp0.s1, temp1.s0), int2);
- int2 right = CONVERT((int2)(temp0.s2, temp1.s1), int2);
+ int8 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);
+ int8 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8);
+ int8 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Apply a 3x3 convolution matrix to a single channel QASYMM8 input image and return the result.
@@ -144,9 +145,9 @@ inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
* @param[in] output_shift Output scale divisor exponent
* @param[in] bias (Optional) Bias value
*
- * @return a uchar2 containing 2 convoluted values.
+ * @return a uchar8 containing 8 convoluted values.
*/
-inline uchar2 convolution3x3(
+inline uchar8 convolution3x3(
Image *src,
const uchar mat0, const uchar mat1, const uchar mat2,
const uchar mat3, const uchar mat4, const uchar mat5,
@@ -159,20 +160,20 @@ inline uchar2 convolution3x3(
#endif //defined(HAS_BIAS)
)
{
- int2 pixels;
+ int8 pixels;
pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2, input_offset, weight_offset);
pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5, input_offset, weight_offset);
pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8, input_offset, weight_offset);
#if defined(HAS_BIAS)
- pixels += (int2)(bias);
+ pixels += (int8)(bias);
#endif //defined(HAS_BIAS)
- pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 2);
+ pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 8);
pixels = pixels + output_offset;
pixels = clamp(pixels, 0, 255);
- return CONVERT(pixels, uchar2);
+ return CONVERT(pixels, uchar8);
}
/** This function computes the horizontal integral of the image.
@@ -241,7 +242,7 @@ __kernel void depthwise_convolution_3x3_quantized(
int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
#endif //defined(HAS_BIAS)
- uchar2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+ uchar8 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
weights_values1.s0, weights_values1.s1, weights_values1.s2,
weights_values2.s0, weights_values2.s1, weights_values2.s2,
input_offset, weight_offset, output_offset,
@@ -252,7 +253,7 @@ __kernel void depthwise_convolution_3x3_quantized(
#endif //defined(HAS_BIAS)
);
- vstore2(pixels, 0, dst.ptr);
+ vstore8(pixels, 0, dst.ptr);
}
#endif //defined(CONV_STRIDE_X)