From 944d3f79baef6878916c1ec082a71768f0bf3409 Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Tue, 16 Jan 2018 15:38:35 +0000
Subject: COMPMID-751 Processing 8 elements makes computation up to 80us faster
 on MobileNet QASYMM8 dwc layers

Change-Id: I30eaea3f3625086e311ad201ef73a8f06a01e382
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116521
Tested-by: Jenkins
Reviewed-by: Georgios Pinitas
---
 .../cl_kernels/depthwise_convolution_quantized.cl | 65 +++++++++++-----------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 0cd4e7148e..8a757fc2bd 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,22 +45,23 @@
  * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
  * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
  *
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
  */
-inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_1(__global const uchar *left_pixel,
                                     const int             left_coeff,
                                     const int             middle_coeff,
                                     const int             right_coeff,
                                     const int             input_offset,
                                     const int             weight_offset)
 {
-    int4 temp = CONVERT(vload4(0, left_pixel), int4);
+    int8 temp0 = CONVERT(vload8(0, left_pixel), int8);
+    int2 temp1 = CONVERT(vload2(0, (left_pixel + 8 * sizeof(uchar))), int2);

-    int2 left   = CONVERT(temp.s01, int2);
-    int2 middle = CONVERT(temp.s12, int2);
-    int2 right  = CONVERT(temp.s23, int2);
+    int8 left   = CONVERT(temp0.s01234567, int8);
+    int8 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8);
+    int8 right  = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8);

-    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
 }

 /** Compute a 1D horizontal convolution of size 3 and stride 2 for uchar type.
@@ -72,23 +73,23 @@ inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
  * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
  * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
  *
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
  */
-inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_2(__global const uchar *left_pixel,
                                     const int             left_coeff,
                                     const int             middle_coeff,
                                     const int             right_coeff,
                                     const int             input_offset,
                                     const int             weight_offset)
 {
-    int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
-    int  temp1 = CONVERT(*(left_pixel + 4 * sizeof(uchar)), int);
+    int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+    int   temp1 = CONVERT(*(left_pixel + 16 * sizeof(uchar)), int);

-    int2 left   = CONVERT(temp0.s02, int2);
-    int2 middle = CONVERT(temp0.s13, int2);
-    int2 right  = CONVERT((int2)(temp0.s2, temp1), int2);
+    int8 left   = CONVERT(temp0.s02468ace, int8);
+    int8 middle = CONVERT(temp0.s13579bdf, int8);
+    int8 right  = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8);

-    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
 }

 /** Compute a 1D horizontal convolution of size 3 and stride 3 for uchar type.
@@ -100,23 +101,23 @@ inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
  * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
  * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
  *
- * @return a int2 containing 2 convoluted values.
+ * @return a int8 containing 8 convoluted values.
  */
-inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_3(__global const uchar *left_pixel,
                                     const int             left_coeff,
                                     const int             middle_coeff,
                                     const int             right_coeff,
                                     const int             input_offset,
                                     const int             weight_offset)
 {
-    int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
-    int2 temp1 = CONVERT(vload2(0, (left_pixel + 4 * sizeof(uchar))), int2);
+    int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+    int8  temp1 = CONVERT(vload8(0, (left_pixel + 16 * sizeof(uchar))), int8);

-    int2 left   = CONVERT(temp0.s03, int2);
-    int2 middle = CONVERT((int2)(temp0.s1, temp1.s0), int2);
-    int2 right  = CONVERT((int2)(temp0.s2, temp1.s1), int2);
+    int8 left   = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);
+    int8 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8);
+    int8 right  = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8);

-    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
 }

 /** Apply a 3x3 convolution matrix to a single channel QASYMM8 input image and return the result.
@@ -144,9 +145,9 @@ inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
  * @param[in] output_shift      Output scale divisor exponent
  * @param[in] bias              (Optional) Bias value
  *
- * @return a uchar2 containing 2 convoluted values.
+ * @return a uchar8 containing 8 convoluted values.
  */
-inline uchar2 convolution3x3(
+inline uchar8 convolution3x3(
     Image      *src,
     const uchar mat0, const uchar mat1, const uchar mat2,
     const uchar mat3, const uchar mat4, const uchar mat5,
@@ -159,20 +160,20 @@ inline uchar2 convolution3x3(
 #endif //defined(HAS_BIAS)
 )
 {
-    int2 pixels;
+    int8 pixels;

     pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2, input_offset, weight_offset);
     pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5, input_offset, weight_offset);
     pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8, input_offset, weight_offset);

 #if defined(HAS_BIAS)
-    pixels += (int2)(bias);
+    pixels += (int8)(bias);
 #endif //defined(HAS_BIAS)

-    pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 2);
+    pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 8);
     pixels = pixels + output_offset;
     pixels = clamp(pixels, 0, 255);

-    return CONVERT(pixels, uchar2);
+    return CONVERT(pixels, uchar8);
 }

 /** This function computes the horizontal integral of the image.
@@ -241,7 +242,7 @@ __kernel void depthwise_convolution_3x3_quantized(
     int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
 #endif //defined(HAS_BIAS)

-    uchar2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+    uchar8 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                    weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                    weights_values2.s0, weights_values2.s1, weights_values2.s2,
                                    input_offset, weight_offset, output_offset,
@@ -252,7 +253,7 @@
 #endif //defined(HAS_BIAS)
     );

-    vstore2(pixels, 0, dst.ptr);
+    vstore8(pixels, 0, dst.ptr);
 }

 #endif //defined(CONV_STRIDE_X)
--
cgit v1.2.1
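
The speed-up comes from widening each work-item from 2 outputs to 8. For stride 1, producing
8 outputs of a 1x3 window needs 10 consecutive input pixels, which the kernel now fetches with
one vload8 plus one vload2 and reassembles through the swizzles above; left, middle and right
are the same 8-wide window shifted by 0, 1 and 2 pixels. The standalone C sketch below mirrors
that gather pattern and the quantized accumulation. It is illustrative only: the offsets and
the float output scale are placeholder values, not anything from the library, and the float
requantization stands in for the kernel's fixed-point
ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE helper.

    #include <stdio.h>

    /* Illustrative quantization parameters -- placeholders, not library values. */
    #define INPUT_OFFSET  (-3)
    #define WEIGHT_OFFSET 5
    #define OUTPUT_OFFSET 10
    #define OUTPUT_SCALE  0.05f /* stands in for the fixed-point multiplier/shift */

    static unsigned char clamp_u8(float v)
    {
        return (unsigned char)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
    }

    int main(void)
    {
        /* 10 input pixels feed 8 stride-1 outputs: an 8-wide load covers
         * indices 0..7 (temp0), a 2-wide load covers 8..9 (temp1). */
        const unsigned char row[10] = { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 };
        const int w0 = 2, w1 = -1, w2 = 3; /* one 1x3 filter row */

        int temp0[8], temp1[2];
        for (int i = 0; i < 8; ++i) temp0[i] = row[i];
        for (int i = 0; i < 2; ++i) temp1[i] = row[8 + i];

        for (int i = 0; i < 8; ++i)
        {
            /* left   = temp0.s01234567                      (window shift 0)
             * middle = (temp0.s1234, temp0.s567, temp1.s0)  (window shift 1)
             * right  = (temp0.s2345, temp0.s67,  temp1.s01) (window shift 2) */
            int left   = temp0[i];
            int middle = (i + 1 < 8) ? temp0[i + 1] : temp1[i - 7];
            int right  = (i + 2 < 8) ? temp0[i + 2] : temp1[i - 6];

            /* Quantized multiply-accumulate, as in convolution1x3_stride_1. */
            int acc = (left   + INPUT_OFFSET) * (w0 + WEIGHT_OFFSET)
                    + (middle + INPUT_OFFSET) * (w1 + WEIGHT_OFFSET)
                    + (right  + INPUT_OFFSET) * (w2 + WEIGHT_OFFSET);

            /* Simplified requantization: scale, add offset, clamp to [0, 255]. */
            printf("out[%d] = %u\n", i, (unsigned)clamp_u8(acc * OUTPUT_SCALE + OUTPUT_OFFSET));
        }
        return 0;
    }

With stride 2 the same pattern needs 17 input pixels (one vload16 plus a scalar load), and with
stride 3 it needs 24 (vload16 plus vload8), which is where the longer swizzles such as
temp0.s02468ace in the patch come from.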