From 3394f3e3df7fd2d924c41822a8564493fc06473a Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 16 Sep 2022 14:14:21 +0100 Subject: Rework direct convolution heuristic on OpenCL Resolves COMPMID-5634 Change-Id: I075de70d509d0c4430b4bcf3f218384e237a3a56 Signed-off-by: Gian Marco Iodice Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/453708 Tested-by: bsgcomp Reviewed-by: Viet-Hoa Do Comments-Addressed: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8473 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 66 ++++++++++++----------- src/core/CL/cl_kernels/tile_helpers.h | 11 ++++ 2 files changed, 46 insertions(+), 31 deletions(-) (limited to 'src/core/CL/cl_kernels') diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index e602fbb525..2e7ed5a4ca 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -116,12 +116,12 @@ __kernel void direct_convolution_nhwc( // In case of dynamic tensor support, the following dimensions should be passed as function argument. #define _IWEI_WIDTH WEI_WIDTH #define _IWEI_HEIGHT WEI_HEIGHT -#define _ISRC_WIDTH src_w -#define _ISRC_HEIGHT src_h -#define _ISRC_CHANNELS src_c -#define _IDST_WIDTH dst_w -#define _IDST_HEIGHT dst_h -#define _IDST_CHANNELS dst_c +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _ISRC_CHANNELS SRC_CHANNELS +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _IDST_CHANNELS DST_CHANNELS #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) // If quantized, the output tile has to be quantized first before being stored to global memory @@ -159,12 +159,25 @@ __kernel void direct_convolution_nhwc( for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { - int ck = 0; int xk = i % _IWEI_WIDTH; int yk = i / _IWEI_WIDTH; - int k = 0; - for(; k <= (_ISRC_CHANNELS - K0); k += K0) + TILE(int, M0, 1, my); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + int x_s = xi[i].v + xk; + int y_s = yi[i].v + yk; + my[i].v = x_s + y_s *_ISRC_WIDTH; + my[i].v = my[i].v + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT); + my[i].v = select(-1, my[i].v, x_s >= 0); + my[i].v = select(-1, my[i].v, x_s < _ISRC_WIDTH); + my[i].v = select(-1, my[i].v, y_s >= 0); + my[i].v = select(-1, my[i].v, y_s < _ISRC_HEIGHT); + }) + + int ck = 0; + for(; ck <= (_ISRC_CHANNELS - K0); ck += K0) { TILE(SRC_DATA_TYPE, M0, K0, a); TILE(WEI_DATA_TYPE, N0, K0, b); @@ -175,13 +188,8 @@ __kernel void direct_convolution_nhwc( a[i].v = ZERO_VALUE; }) - LOOP_UNROLLING(int, i, 0, 1, N0, - { - b[i].v = ZERO_VALUE; - }) - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, my, a); // Load tile from the weights tensor T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); @@ -192,15 +200,13 @@ __kernel void direct_convolution_nhwc( // Apply the offset correction (correction usually needed for asymmetric quantized computation) // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c); - - ck += K0; } // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS // This #if directive should be removed in case of dynamic tensor support #if defined(LEFTOVER_LOOP) // Left-over accumulations - for(; k < _ISRC_CHANNELS; ++k) + for(; ck < _ISRC_CHANNELS; ++ck) { TILE(SRC_DATA_TYPE, M0, 1, a); TILE(WEI_DATA_TYPE, N0, 1, b); @@ -229,8 +235,6 @@ __kernel void direct_convolution_nhwc( // Apply the offset correction (operation usually needed for asymmetric quantized computation) // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); - - ++ck; } #endif // defined(LEFTOVER_LOOP) } @@ -249,17 +253,6 @@ __kernel void direct_convolution_nhwc( #endif // HAS_BIAS - TILE(uint, M0, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); - dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); - }) - - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - #if defined(IS_QUANTIZED) TILE(DST_DATA_TYPE, M0, N0, cq); @@ -271,6 +264,17 @@ __kernel void direct_convolution_nhwc( // Apply activation T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE); + TILE(uint, M0, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8 // Store the tile in reverse order so the invalid values are overwritten with the valid ones T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y); diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 4b6144a22d..6279fb4fb6 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -653,6 +653,17 @@ }) \ }) +#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, yi, dst) \ + ({ \ + LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ + { \ + if(yi[_i].v >= 0) \ + { \ + dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[_i].v, STRIDE_Y); \ + } \ + }) \ + }) + /** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates * * @param[in] DATA_TYPE Data type -- cgit v1.2.1