From 287b570b86ba40a801136aded140b83435ca9314 Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Fri, 16 Feb 2018 11:01:04 +0000 Subject: COMPMID-853 Use tile 2 for CL depthwise convolution QASYM8 Change-Id: I91f6a0b057f5eb84c6ac7db5abbc05c7520ed5d2 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120760 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../cl_kernels/depthwise_convolution_quantized.cl | 328 ++++++++++----------- 1 file changed, 154 insertions(+), 174 deletions(-) (limited to 'src/core/CL/cl_kernels/depthwise_convolution_quantized.cl') diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index e4345817fc..b2527a4c7d 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -24,158 +24,45 @@ #include "helpers_asymm.h" -#if defined(CONV_STRIDE_X) +#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) + +#if CONV_STRIDE_X > 3 +#error "Stride X not supported" +#endif /* CONV_STRIDE_X > 3 */ #if CONV_STRIDE_X == 1 -#define convolution1x3 convolution1x3_stride_1 +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int8 temp0 = CONVERT(vload8(0, first_value), int8); \ + int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \ + \ + left = CONVERT(temp0.s01234567, int8); \ + middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \ + right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \ + }) #elif CONV_STRIDE_X == 2 -#define convolution1x3 convolution1x3_stride_2 -#elif CONV_STRIDE_X == 3 -#define convolution1x3 convolution1x3_stride_3 +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, first_value), int16); \ + int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \ + \ + left = CONVERT(temp0.s02468ace, int8); \ + middle = CONVERT(temp0.s13579bdf, int8); \ + right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \ + }) #else /* CONV_STRIDE_X */ -#error "Stride not supported" +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, first_value), int16); \ + int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \ + \ + left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ + middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \ + right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \ + }) #endif /* CONV_STRIDE_X */ -/** Compute a 1D horizontal convolution of size 3 and stride 1 for uchar type. - * - * @param[in] left_pixel Pointer to the left pixel. - * @param[in] left_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right_coeff Weight of the right pixel - * @param[in] input_offset Quantized offset of zero point of the input tensor data range - * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range - * - * @return a int8 containing 8 convoluted values. - */ -inline int8 convolution1x3_stride_1(__global const uchar *left_pixel, - const int left_coeff, - const int middle_coeff, - const int right_coeff, - const int input_offset, - const int weight_offset) -{ - int8 temp0 = CONVERT(vload8(0, left_pixel), int8); - int2 temp1 = CONVERT(vload2(0, (left_pixel + 8 * sizeof(uchar))), int2); - - int8 left = CONVERT(temp0.s01234567, int8); - int8 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); - int8 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); - - return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset); -} - -/** Compute a 1D horizontal convolution of size 3 and stride 2 for uchar type. - * - * @param[in] left_pixel Pointer to the left pixel. - * @param[in] left_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right_coeff Weight of the right pixel - * @param[in] input_offset Quantized offset of zero point of the input tensor data range - * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range - * - * @return a int8 containing 8 convoluted values. - */ -inline int8 convolution1x3_stride_2(__global const uchar *left_pixel, - const int left_coeff, - const int middle_coeff, - const int right_coeff, - const int input_offset, - const int weight_offset) -{ - int16 temp0 = CONVERT(vload16(0, left_pixel), int16); - int temp1 = CONVERT(*(left_pixel + 16 * sizeof(uchar)), int); - - int8 left = CONVERT(temp0.s02468ace, int8); - int8 middle = CONVERT(temp0.s13579bdf, int8); - int8 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); - - return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset); -} - -/** Compute a 1D horizontal convolution of size 3 and stride 3 for uchar type. - * - * @param[in] left_pixel Pointer to the left pixel. - * @param[in] left_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right_coeff Weight of the right pixel - * @param[in] input_offset Quantized offset of zero point of the input tensor data range - * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range - * - * @return a int8 containing 8 convoluted values. - */ -inline int8 convolution1x3_stride_3(__global const uchar *left_pixel, - const int left_coeff, - const int middle_coeff, - const int right_coeff, - const int input_offset, - const int weight_offset) -{ - int16 temp0 = CONVERT(vload16(0, left_pixel), int16); - int8 temp1 = CONVERT(vload8(0, (left_pixel + 16 * sizeof(uchar))), int8); - - int8 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); - int8 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); - int8 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); - - return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset); -} - -/** Apply a 3x3 convolution matrix to a single channel QASYMM8 input image and return the result. - * - * Convolution matrix layout: - * - * [ mat0, mat1, mat2 ]\n - * [ mat3, mat4, mat5 ]\n - * [ mat6, mat7, mat8 ]\n - * - * @param[in] src A pointer to source Image structure - * @param[in] mat0 Coefficient from the convolution matrix - * @param[in] mat1 Coefficient from the convolution matrix - * @param[in] mat2 Coefficient from the convolution matrix - * @param[in] mat3 Coefficient from the convolution matrix - * @param[in] mat4 Coefficient from the convolution matrix - * @param[in] mat5 Coefficient from the convolution matrix - * @param[in] mat6 Coefficient from the convolution matrix - * @param[in] mat7 Coefficient from the convolution matrix - * @param[in] mat8 Coefficient from the convolution matrix - * @param[in] input_offset Quantized offset of zero point of the input tensor data range - * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range - * @param[in] output_offset Quantized offset of zero point of the output tensor data range - * @param[in] output_multiplier Output scale multiplier - * @param[in] output_shift Output scale divisor exponent - * @param[in] bias (Optional) Bias value - * - * @return a uchar8 containing 8 convoluted values. - */ -inline uchar8 convolution3x3( - Image *src, - const uchar mat0, const uchar mat1, const uchar mat2, - const uchar mat3, const uchar mat4, const uchar mat5, - const uchar mat6, const uchar mat7, const uchar mat8, - const int input_offset, const int weight_offset, const int output_offset, - const int output_multiplier, const int output_shift -#if defined(HAS_BIAS) - , - const int bias -#endif //defined(HAS_BIAS) -) -{ - int8 pixels; - - pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2, input_offset, weight_offset); - pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5, input_offset, weight_offset); - pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8, input_offset, weight_offset); -#if defined(HAS_BIAS) - pixels += (int8)(bias); -#endif //defined(HAS_BIAS) - - pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 8); - pixels = pixels + output_offset; - - return CONVERT_SAT(pixels, uchar8); -} - -/** This function computes the horizontal integral of the image. +/** This function computes the horizontal integral of the image and adds offsets. * * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) @@ -205,11 +92,6 @@ inline uchar8 convolution3x3( * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector - * @param[in] input_offset Quantized offset of zero point of the input tensor data range - * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range - * @param[in] output_offset Quantized offset of zero point of the output tensor data range - * @param[in] output_multiplier Output scale multiplier - * @param[in] output_shift Output scale divisor exponent */ __kernel void depthwise_convolution_3x3_quantized( @@ -217,41 +99,139 @@ __kernel void depthwise_convolution_3x3_quantized( TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights), #if defined(HAS_BIAS) - VECTOR_DECLARATION(biases), + VECTOR_DECLARATION(biases) #endif //defined(HAS_BIAS) - int input_offset, - int weight_offset, - int output_offset, - int output_multiplier, - int output_shift) +) { Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights); #if defined(HAS_BIAS) Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); -#endif //defined(HAS_BIAS) - - uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y; - uchar3 weights_values0 = vload3(0, weights.ptr + offset.s0); - uchar3 weights_values1 = vload3(0, weights.ptr + offset.s1); - uchar3 weights_values2 = vload3(0, weights.ptr + offset.s2); -#if defined(HAS_BIAS) int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2)))); #endif //defined(HAS_BIAS) - uchar8 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2, - weights_values1.s0, weights_values1.s1, weights_values1.s2, - weights_values2.s0, weights_values2.s1, weights_values2.s2, - input_offset, weight_offset, output_offset, - output_multiplier, output_shift + uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y); + uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y); + uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y); + + int8 values0 = 0; + int8 sum0 = 0; +#if CONV_STRIDE_Y == 1 + int8 values1 = 0; + int8 sum1 = 0; +#endif /* CONV_STRIDE_Y */ + + // Row0 + int8 left, middle, right; + GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right); + values0 += left * (int8)(w0.s0); + values0 += middle * (int8)(w0.s1); + values0 += right * (int8)(w0.s2); + +#if WEIGHTS_OFFSET != 0 + sum0 += left + middle + right; +#endif /* WEIGHTS_OFFSET != 0 */ + + // Row1 + GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right); + values0 += left * (int8)(w1.s0); + values0 += middle * (int8)(w1.s1); + values0 += right * (int8)(w1.s2); +#if CONV_STRIDE_Y == 1 + values1 += left * (int8)(w0.s0); + values1 += middle * (int8)(w0.s1); + values1 += right * (int8)(w0.s2); +#endif /* CONV_STRIDE_Y == 1 */ + +#if WEIGHTS_OFFSET != 0 + int8 tmp = left + middle + right; + sum0 += tmp; +#if CONV_STRIDE_Y == 1 + sum1 += tmp; +#endif /* CONV_STRIDE_Y == 1 */ +#endif /* WEIGHTS_OFFSET != 0 */ + + // Row2 + GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right); + values0 += left * (int8)(w2.s0); + values0 += middle * (int8)(w2.s1); + values0 += right * (int8)(w2.s2); +#if CONV_STRIDE_Y == 1 + values1 += left * (int8)(w1.s0); + values1 += middle * (int8)(w1.s1); + values1 += right * (int8)(w1.s2); +#endif /* CONV_STRIDE_Y == 1 */ + +#if WEIGHTS_OFFSET != 0 + tmp = left + middle + right; + sum0 += tmp; +#if CONV_STRIDE_Y == 1 + sum1 += tmp; +#endif /* CONV_STRIDE_Y == 1 */ +#endif /* WEIGHTS_OFFSET != 0 */ + +#if CONV_STRIDE_Y == 1 + // Row3 + GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right); + values1 += left * (int8)(w2.s0); + values1 += middle * (int8)(w2.s1); + values1 += right * (int8)(w2.s2); + +#if WEIGHTS_OFFSET != 0 + sum1 += left + middle + right; +#endif /* WEIGHTS_OFFSET != 0 */ +#endif /* CONV_STRIDE_Y == 1 */ + #if defined(HAS_BIAS) - , - bias_value + values0 += (int8)(bias_value); +#if CONV_STRIDE_Y == 1 + values1 += (int8)(bias_value); +#endif /* CONV_STRIDE_Y == 1 */ #endif //defined(HAS_BIAS) - ); - vstore8(pixels, 0, dst.ptr); +#if WEIGHTS_OFFSET != 0 + values0 += sum0 * (int8)(WEIGHTS_OFFSET); +#if CONV_STRIDE_Y == 1 + values1 += sum1 * (int8)(WEIGHTS_OFFSET); +#endif /* CONV_STRIDE_Y == 1 */ +#endif /* WEIGHTS_OFFSET != 0 */ + +#if INPUT_OFFSET != 0 + ushort sum_weights = 0; + ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2); + sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2; + values0 += sum_weights * (int8)(INPUT_OFFSET); +#if CONV_STRIDE_Y == 1 + values1 += sum_weights * (int8)(INPUT_OFFSET); +#endif /* CONV_STRIDE_Y == 1 */ +#endif /* INPUT_OFFSET != 0 */ + +#if K_OFFSET != 0 + values0 += (int8)(K_OFFSET); +#if CONV_STRIDE_Y == 1 + values1 += (int8)(K_OFFSET); +#endif /* CONV_STRIDE_Y == 1 */ +#endif /* K_OFFSET != 0 */ + + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values0 += (int8)OUTPUT_OFFSET; + uchar8 res0 = convert_uchar8_sat(values0); + res0 = max(res0, (uchar8)0); + res0 = min(res0, (uchar8)255); + + vstore8(res0, 0, dst.ptr); +#if CONV_STRIDE_Y == 1 + + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values1 += (int8)OUTPUT_OFFSET; + uchar8 res1 = convert_uchar8_sat(values1); + res1 = max(res1, (uchar8)0); + res1 = min(res1, (uchar8)255); + + vstore8(res1, 0, dst.ptr + dst_stride_y); +#endif /* CONV_STRIDE_Y == 1 */ } -#endif //defined(CONV_STRIDE_X) \ No newline at end of file + +#endif /* defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */ -- cgit v1.2.1