From 4b90865ab985d571f70c60583cdfb8c7a65f1670 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 18 Oct 2018 10:21:02 +0100 Subject: COMPMID-1413 - Improve the performance of GEMMLowp with 8 bit dot product on OpenCL COMPMID-1424 - Add dot product support for CLDepthwise QASYMM8 3x3 NHWC non-unit stride With this patch we are able to improve the performance of MobileNet v1-qasymm8 by 37 % Tried to use the dot product instruction in CLDepthwise QASYMM8 3x3 NHWC non-unit stride but I have not seen any benefit (maybe because we have few arithmetic operations and we do not have more load instructions). However Depthwise convolution has been improved by 30% Change-Id: Id768a99c2e53a04276707e427af5d0ec93419ada Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/155082 Tested-by: bsgcomp Reviewed-by: Georgios Pinitas --- .../cl_kernels/depthwise_convolution_quantized.cl | 328 +++++++++++++-------- 1 file changed, 206 insertions(+), 122 deletions(-) (limited to 'src/core/CL/cl_kernels/depthwise_convolution_quantized.cl') diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index 3239885abc..421c8b6aab 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -24,7 +24,7 @@ #include "helpers_asymm.h" -#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) +#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) #if defined(FUSED_ACTIVATION) #define DATA_TYPE uchar @@ -39,9 +39,9 @@ #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) -#define 
ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val); +#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val); #else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) -#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3)); +#define ARM_DOT(x, y, val) val += arm_dot((x), (y)); #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) @@ -248,7 +248,16 @@ __kernel void depthwise_convolution_3x3_quantized_nchw( #endif /* CONV_STRIDE_Y == 1 */ #endif /* K_OFFSET != 0 */ +#if defined(REAL_MULTIPLIER) + + values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8); + +#else // defined(REAL_MULTIPLIER) + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + +#endif // defined(REAL_MULTIPLIER) + values0 += (int8)OUTPUT_OFFSET; uchar8 res0 = convert_uchar8_sat(values0); res0 = max(res0, (uchar8)0); @@ -256,8 +265,16 @@ __kernel void depthwise_convolution_3x3_quantized_nchw( vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr); #if CONV_STRIDE_Y == 1 +#if defined(REAL_MULTIPLIER) + + values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8); + +#else // defined(REAL_MULTIPLIER) values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + +#endif // defined(REAL_MULTIPLIER) + values1 += (int8)OUTPUT_OFFSET; uchar8 res1 = convert_uchar8_sat(values1); res1 = max(res1, (uchar8)0); @@ -397,69 +414,69 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ #endif // CONV_STRIDE_Y == 1 - ARM_DOT(left0.s0, middle0.s0, right0.s0, left1.s0, w0.s0, w0.s1, w0.s2, w1.s0, values0.s0); - 
ARM_DOT(middle1.s0, right1.s0, left2.s0, middle2.s0, w1.s1, w1.s2, w2.s0, w2.s1, values0.s0); + ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0); + ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0); values0.s0 += right2.s0 * w2.s2; - ARM_DOT(left0.s1, middle0.s1, right0.s1, left1.s1, w0.s0, w0.s1, w0.s2, w1.s0, values0.s1); - ARM_DOT(middle1.s1, right1.s1, left2.s1, middle2.s1, w1.s1, w1.s2, w2.s0, w2.s1, values0.s1); + ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1); + ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1); values0.s1 += right2.s1 * w2.s2; - ARM_DOT(left0.s2, middle0.s2, right0.s2, left1.s2, w0.s0, w0.s1, w0.s2, w1.s0, values0.s2); - ARM_DOT(middle1.s2, right1.s2, left2.s2, middle2.s2, w1.s1, w1.s2, w2.s0, w2.s1, values0.s2); + ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2); + ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2); values0.s2 += right2.s2 * w2.s2; - ARM_DOT(left0.s3, middle0.s3, right0.s3, left1.s3, w0.s0, w0.s1, w0.s2, w1.s0, values0.s3); - ARM_DOT(middle1.s3, right1.s3, left2.s3, middle2.s3, w1.s1, w1.s2, w2.s0, w2.s1, values0.s3); + ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3); + ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3); values0.s3 += right2.s3 * w2.s2; - ARM_DOT(left0.s4, middle0.s4, right0.s4, left1.s4, w0.s0, w0.s1, w0.s2, w1.s0, values0.s4); - ARM_DOT(middle1.s4, right1.s4, left2.s4, middle2.s4, w1.s1, w1.s2, w2.s0, w2.s1, values0.s4); + ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, 
w1.s0), values0.s4); + ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4); values0.s4 += right2.s4 * w2.s2; - ARM_DOT(left0.s5, middle0.s5, right0.s5, left1.s5, w0.s0, w0.s1, w0.s2, w1.s0, values0.s5); - ARM_DOT(middle1.s5, right1.s5, left2.s5, middle2.s5, w1.s1, w1.s2, w2.s0, w2.s1, values0.s5); + ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5); + ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5); values0.s5 += right2.s5 * w2.s2; - ARM_DOT(left0.s6, middle0.s6, right0.s6, left1.s6, w0.s0, w0.s1, w0.s2, w1.s0, values0.s6); - ARM_DOT(middle1.s6, right1.s6, left2.s6, middle2.s6, w1.s1, w1.s2, w2.s0, w2.s1, values0.s6); + ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6); + ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6); values0.s6 += right2.s6 * w2.s2; - ARM_DOT(left0.s7, middle0.s7, right0.s7, left1.s7, w0.s0, w0.s1, w0.s2, w1.s0, values0.s7); - ARM_DOT(middle1.s7, right1.s7, left2.s7, middle2.s7, w1.s1, w1.s2, w2.s0, w2.s1, values0.s7); + ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7); + ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7); values0.s7 += right2.s7 * w2.s2; #if CONV_STRIDE_Y == 1 - ARM_DOT(left1.s0, middle1.s0, right1.s0, left2.s0, w0.s0, w0.s1, w0.s2, w1.s0, values1.s0); - ARM_DOT(middle2.s0, right2.s0, left3.s0, middle3.s0, w1.s1, w1.s2, w2.s0, w2.s1, values1.s0); + ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0); + ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0); values1.s0 += right3.s0 * 
w2.s2; - ARM_DOT(left1.s1, middle1.s1, right1.s1, left2.s1, w0.s0, w0.s1, w0.s2, w1.s0, values1.s1); - ARM_DOT(middle2.s1, right2.s1, left3.s1, middle3.s1, w1.s1, w1.s2, w2.s0, w2.s1, values1.s1); + ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1); + ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1); values1.s1 += right3.s1 * w2.s2; - ARM_DOT(left1.s2, middle1.s2, right1.s2, left2.s2, w0.s0, w0.s1, w0.s2, w1.s0, values1.s2); - ARM_DOT(middle2.s2, right2.s2, left3.s2, middle3.s2, w1.s1, w1.s2, w2.s0, w2.s1, values1.s2); + ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2); + ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2); values1.s2 += right3.s2 * w2.s2; - ARM_DOT(left1.s3, middle1.s3, right1.s3, left2.s3, w0.s0, w0.s1, w0.s2, w1.s0, values1.s3); - ARM_DOT(middle2.s3, right2.s3, left3.s3, middle3.s3, w1.s1, w1.s2, w2.s0, w2.s1, values1.s3); + ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3); + ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3); values1.s3 += right3.s3 * w2.s2; - ARM_DOT(left1.s4, middle1.s4, right1.s4, left2.s4, w0.s0, w0.s1, w0.s2, w1.s0, values1.s4); - ARM_DOT(middle2.s4, right2.s4, left3.s4, middle3.s4, w1.s1, w1.s2, w2.s0, w2.s1, values1.s4); + ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4); + ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4); values1.s4 += right3.s4 * w2.s2; - ARM_DOT(left1.s5, middle1.s5, right1.s5, left2.s5, w0.s0, w0.s1, w0.s2, w1.s0, values1.s5); - ARM_DOT(middle2.s5, right2.s5, left3.s5, middle3.s5, w1.s1, w1.s2, w2.s0, w2.s1, 
values1.s5); + ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5); + ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5); values1.s5 += right3.s5 * w2.s2; - ARM_DOT(left1.s6, middle1.s6, right1.s6, left2.s6, w0.s0, w0.s1, w0.s2, w1.s0, values1.s6); - ARM_DOT(middle2.s6, right2.s6, left3.s6, middle3.s6, w1.s1, w1.s2, w2.s0, w2.s1, values1.s6); + ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6); + ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6); values1.s6 += right3.s6 * w2.s2; - ARM_DOT(left1.s7, middle1.s7, right1.s7, left2.s7, w0.s0, w0.s1, w0.s2, w1.s0, values1.s7); - ARM_DOT(middle2.s7, right2.s7, left3.s7, middle3.s7, w1.s1, w1.s2, w2.s0, w2.s1, values1.s7); + ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7); + ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7); values1.s7 += right3.s7 * w2.s2; #endif // CONV_STRIDE_Y == 1 @@ -494,7 +511,16 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( #endif /* CONV_STRIDE_Y == 1 */ #endif /* K_OFFSET != 0 */ +#if defined(REAL_MULTIPLIER) + + values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8); + +#else // defined(REAL_MULTIPLIER) + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + +#endif // defined(REAL_MULTIPLIER) + values0 += (int8)OUTPUT_OFFSET; uchar8 res0 = convert_uchar8_sat(values0); res0 = max(res0, (uchar8)0); @@ -503,7 +529,16 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr); #if CONV_STRIDE_Y == 1 +#if defined(REAL_MULTIPLIER) + + values1 = CONVERT(round(CONVERT(values1, float8) * 
(float8)REAL_MULTIPLIER), int8); + +#else // defined(REAL_MULTIPLIER) + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + +#endif // defined(REAL_MULTIPLIER) + values1 += (int8)OUTPUT_OFFSET; uchar8 res1 = convert_uchar8_sat(values1); res1 = max(res1, (uchar8)0); @@ -522,6 +557,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( #define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE) #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) #define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) #define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE) @@ -540,33 +576,62 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \ ({ \ - ARM_DOT(val0.s0, val1.s0, val2.s0, val3.s0, w0.s0, w1.s0, w2.s0, w3.s0, acc.s0); \ - ARM_DOT(val4.s0, val5.s0, val6.s0, val7.s0, w4.s0, w5.s0, w6.s0, w7.s0, acc.s0); \ + ARM_DOT((uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), (uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), acc.s0); \ + ARM_DOT((uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), (uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), acc.s0); \ acc.s0 += val8.s0 * w8.s0; \ \ - ARM_DOT(val0.s1, val1.s1, val2.s1, val3.s1, w0.s1, w1.s1, w2.s1, w3.s1, acc.s1); \ - ARM_DOT(val4.s1, val5.s1, val6.s1, val7.s1, w4.s1, w5.s1, w6.s1, w7.s1, acc.s1); \ + ARM_DOT((uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), (uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), acc.s1); \ + ARM_DOT((uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), (uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), acc.s1); \ acc.s1 += val8.s1 * w8.s1; \ \ - ARM_DOT(val0.s2, val1.s2, val2.s2, val3.s2, w0.s2, w1.s2, w2.s2, w3.s2, acc.s2); \ - ARM_DOT(val4.s2, val5.s2, val6.s2, val7.s2, w4.s2, w5.s2, w6.s2, w7.s2, 
acc.s2); \ + ARM_DOT((uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), (uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), acc.s2); \ + ARM_DOT((uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), (uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), acc.s2); \ acc.s2 += val8.s2 * w8.s2; \ \ - ARM_DOT(val0.s3, val1.s3, val2.s3, val3.s3, w0.s3, w1.s3, w2.s3, w3.s3, acc.s3); \ - ARM_DOT(val4.s3, val5.s3, val6.s3, val7.s3, w4.s3, w5.s3, w6.s3, w7.s3, acc.s3); \ + ARM_DOT((uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), (uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), acc.s3); \ + ARM_DOT((uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), (uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), acc.s3); \ acc.s3 += val8.s3 * w8.s3; \ }) #if WEIGHTS_OFFSET != 0 -#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \ - ({ \ - sum += CONVERT(val0, VEC_INT) + CONVERT(val1, VEC_INT) + CONVERT(val2, VEC_INT) + CONVERT(val3, VEC_INT) + CONVERT(val4, VEC_INT) + CONVERT(val5, VEC_INT) + CONVERT(val6, VEC_INT) + CONVERT(val7, VEC_INT) + CONVERT(val8, VEC_INT); \ - DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8); \ +#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \ + ({ \ + ARM_DOT((uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), (uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), acc.s0); \ + ARM_DOT((uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), (uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), acc.s0); \ + ARM_DOT((uchar4)(w8.s0, 0, 0, 0), (uchar4)val8.s0, acc.s0); \ + \ + ARM_DOT((uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), (uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), acc.s1); \ + ARM_DOT((uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), (uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), acc.s1); \ + ARM_DOT((uchar4)(w8.s1, 0, 0, 0), (uchar4)val8.s1, acc.s1); \ + \ + ARM_DOT((uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), (uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), acc.s2); \ + ARM_DOT((uchar4)(w4.s2, 
w5.s2, w6.s2, w7.s2), (uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), acc.s2); \ + ARM_DOT((uchar4)(w8.s2, 0, 0, 0), (uchar4)val8.s2, acc.s2); \ + \ + ARM_DOT((uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), (uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), acc.s3); \ + ARM_DOT((uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), (uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), acc.s3); \ + ARM_DOT((uchar4)(w8.s3, 0, 0, 0), (uchar4)val8.s3, acc.s3); \ }) #else /* WEIGHTS_OFFSET != 0 */ -#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) +#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) #endif /* WEIGHTS_OFFSET != 0 */ +#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \ + ({ \ + sum = CONVERT(val0, VEC_INT); \ + ARM_DOT((uchar4)(val1.s0, val2.s0, val3.s0, val4.s0), (uchar4)1, sum.s0); \ + ARM_DOT((uchar4)(val5.s0, val6.s0, val7.s0, val8.s0), (uchar4)1, sum.s0); \ + \ + ARM_DOT((uchar4)(val1.s1, val2.s1, val3.s1, val4.s1), (uchar4)1, sum.s1); \ + ARM_DOT((uchar4)(val5.s1, val6.s1, val7.s1, val8.s1), (uchar4)1, sum.s1); \ + \ + ARM_DOT((uchar4)(val1.s2, val2.s2, val3.s2, val4.s2), (uchar4)1, sum.s2); \ + ARM_DOT((uchar4)(val5.s2, val6.s2, val7.s2, val8.s2), (uchar4)1, sum.s2); \ + \ + ARM_DOT((uchar4)(val1.s3, val2.s3, val3.s3, val4.s3), (uchar4)1, sum.s3); \ + ARM_DOT((uchar4)(val5.s3, val6.s3, val7.s3, val8.s3), (uchar4)1, sum.s3); \ + }) + #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) @@ -626,11 +691,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc( __global uchar *src_addr = src_ptr 
+ src_offset_first_element_in_bytes + x * VEC_SIZE; - int z_coord = 0; - int4 offset = 0; - const int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y; + int z_coord = 0; + int4 offset = 0; + int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT; - // We compute 2x1x1 [C,W,H] elements + // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1 + y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1); + y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1); + y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1); + y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1); + + int4 y_offset = convert_int4(y_coord * (int)src_stride_y); + + // We compute 4x1x1 [C,W,H] elements VEC_INT acc = 0, sum = 0; // Load weights @@ -712,7 +785,15 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc( acc += (VEC_INT)K_OFFSET; #endif /* K_OFFSET != 0 */ +#if defined(REAL_MULTIPLIER) + + acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); + +#else // defined(REAL_MULTIPLIER) + acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#endif // defined(REAL_MULTIPLIER) + acc += (VEC_INT)OUTPUT_OFFSET; VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR); @@ -782,11 +863,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; - int z_coord = 0; - int4 offset = 0; - int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y; + int z_coord = 0; + int4 offset = 0; + int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT; + + // Only for y = 0 we can have a negative coordinate. 
If so, we convert it to SRC_DIM_1 + y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1); + y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1); + y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1); + y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1); - // We compute 2x2x2 [C,W,H] elements + int4 y_offset = convert_int4(y_coord * (int)src_stride_y); + + // We compute 4x2x2 [C,W,H] elements VEC_INT acc0 = 0, sum0 = 0; VEC_INT acc1 = 0, sum1 = 0; VEC_INT acc2 = 0, sum2 = 0; @@ -930,11 +1019,22 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( acc3 += (VEC_INT)K_OFFSET; #endif /* K_OFFSET != 0 */ +#if defined(REAL_MULTIPLIER) + + acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); + acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); + acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); + acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); + +#else // defined(REAL_MULTIPLIER) + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#endif // defined(REAL_MULTIPLIER) + acc0 += (VEC_INT)OUTPUT_OFFSET; acc1 += (VEC_INT)OUTPUT_OFFSET; acc2 += (VEC_INT)OUTPUT_OFFSET; @@ -977,6 +1077,8 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2) * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1) * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1). 
+ * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication. + * If not, the quantization will be performed using a fixed point multiplication * * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) @@ -1006,6 +1108,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] max_offset The maximum allowed offset for the input tensor */ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( @@ -1014,7 +1117,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( TENSOR3D_DECLARATION(weights), #if defined(HAS_BIAS) VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ +#endif // defined(HAS_BIAS) int max_offset) { int x = get_global_id(0); @@ -1025,15 +1128,23 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; - int z_coord = 0; - int4 offset = 0; - int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y; + int z_coord = 0; + int4 offset = 0; + int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT; - // We compute 2x2x2 [C,W,H] elements - VEC_INT acc0 = 0, sum0 = 0; - VEC_INT acc1 = 0, sum1 = 0; - VEC_INT acc2 = 0, sum2 = 0; - VEC_INT acc3 = 0, sum3 = 0; + // Only for y = 0 we can have a negative coordinate. 
If so, we convert it to SRC_DIM_1 + y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1); + y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1); + y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1); + y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1); + + int4 y_offset = convert_int4(y_coord * (int)src_stride_y); + + // We compute 4x2x1 [C,W,H] elements + VEC_INT acc0 = 0; + VEC_INT acc1 = 0; + VEC_INT sum0 = 0; + VEC_INT sum1 = 0; // Load weights VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z); @@ -1047,17 +1158,21 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z); #if INPUT_OFFSET != 0 - VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) - + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT) - + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT); -#endif /* INPUT_OFFSET != 0 */ + // Initilize the final result with the weights reduction multiplied by INPUT_OFFSET + DOT_PRODUCT_REDUCTION(acc0, w0, w1, w2, w3, w4, w5, w6, w7, w8); + + // Multiply the weights reduction with INPUT_OFFSET + acc0 = INPUT_OFFSET * acc0; + + acc1 = acc0; +#endif // INPUT_OFFSET != 0 // Load input values // z == 0 // Clamp z_coord as for z = 0, it can be negative // z_coord is casted to unsigned int in order to use just a min() operation // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 - z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP; + z_coord = z - (int)CONV_PAD_TOP; z_coord = min((uint)z_coord, (uint)SRC_DIM_2); offset = y_offset + (int4)(z_coord * src_stride_z); offset = min(offset, (int4)max_offset); @@ -1070,7 +1185,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( // z == 1 // z_coord can be only negative for z = 0 so we do not need to clamp it // Moreover z_coord cannot be out-of-bound for z = 1 so 
we do not need to clamp the offset - z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1; + z_coord = z - (int)CONV_PAD_TOP + 1; offset = y_offset + (int4)(z_coord * src_stride_z); VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); @@ -1087,20 +1202,11 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); - // z == 3 - // After z = 1 we can simply add src_stride_z to offset without updating z_coord - // However offset can be out-of-bound so we need to check if it is greater than max_offset - offset += (int4)(src_stride_z); - offset = min(offset, (int4)max_offset); - VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + DOT_PRODUCT_REDUCTION(sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10); + DOT_PRODUCT_ACCUMULATE(acc0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8); - DOT_PRODUCT_ACCUMULATE(acc0, sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8); - DOT_PRODUCT_ACCUMULATE(acc1, sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8); - DOT_PRODUCT_ACCUMULATE(acc2, sum2, values4, values5, values6, values8, values9, values10, values12, values13, values14, w0, w1, w2, w3, w4, w5, w6, w7, w8); - DOT_PRODUCT_ACCUMULATE(acc3, sum3, values5, values6, values7, values9, values10, values11, values13, values14, values15, w0, w1, w2, w3, w4, w5, w6, w7, w8); + DOT_PRODUCT_REDUCTION(sum1, values1, 
values2, values3, values5, values6, values7, values9, values10, values11); + DOT_PRODUCT_ACCUMULATE(acc1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8); #if defined(HAS_BIAS) Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); @@ -1109,74 +1215,52 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( acc0 += bias_values; acc1 += bias_values; - acc2 += bias_values; - acc3 += bias_values; -#endif /* defined(HAS_BIAS) */ + +#endif // defined(HAS_BIAS) #if WEIGHTS_OFFSET != 0 acc0 += WEIGHTS_OFFSET * sum0; acc1 += WEIGHTS_OFFSET * sum1; - acc2 += WEIGHTS_OFFSET * sum2; - acc3 += WEIGHTS_OFFSET * sum3; -#endif /* WEIGHTS_OFFSET != 0 */ - -#if INPUT_OFFSET != 0 - VEC_INT offs = INPUT_OFFSET * sum_we; - - acc0 += offs; - acc1 += offs; - acc2 += offs; - acc3 += offs; -#endif /* INPUT_OFFSET != 0 */ +#endif // WEIGHTS_OFFSET != 0 #if K_OFFSET != 0 acc0 += (VEC_INT)K_OFFSET; acc1 += (VEC_INT)K_OFFSET; - acc2 += (VEC_INT)K_OFFSET; - acc3 += (VEC_INT)K_OFFSET; -#endif /* K_OFFSET != 0 */ + +#endif // K_OFFSET != 0 + +#if defined(REAL_MULTIPLIER) + + acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); + acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); + +#else // defined(REAL_MULTIPLIER) acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#endif // defined(REAL_MULTIPLIER) acc0 += (VEC_INT)OUTPUT_OFFSET; acc1 += (VEC_INT)OUTPUT_OFFSET; - acc2 += (VEC_INT)OUTPUT_OFFSET; - acc3 += (VEC_INT)OUTPUT_OFFSET; VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR); VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR); - 
VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR); - VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR); res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255); res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255); - res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255); - res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255); - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z; VSTORE(VEC_SIZE) (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y); VSTORE(VEC_SIZE) (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y); - -#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) - if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2) -#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) - { - VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z); - VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z); - } } + #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) #endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) -#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) +#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) -- cgit v1.2.1