From 3394f3e3df7fd2d924c41822a8564493fc06473a Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 16 Sep 2022 14:14:21 +0100 Subject: Rework direct convolution heuristic on OpenCL Resolves COMPMID-5634 Change-Id: I075de70d509d0c4430b4bcf3f218384e237a3a56 Signed-off-by: Gian Marco Iodice Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/453708 Tested-by: bsgcomp Reviewed-by: Viet-Hoa Do Comments-Addressed: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8473 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- src/core/CL/DefaultLWSHeuristics.cpp | 9 ++- src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 66 ++++++++++++---------- src/core/CL/cl_kernels/tile_helpers.h | 11 ++++ src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 6 ++ .../ClDirectConvDefaultConfigValhall.cpp | 64 +++++++++++++++++---- 5 files changed, 113 insertions(+), 43 deletions(-) diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp index c739b9dc03..a53fdbbab6 100644 --- a/src/core/CL/DefaultLWSHeuristics.cpp +++ b/src/core/CL/DefaultLWSHeuristics.cpp @@ -61,7 +61,14 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z) if(gws_x < gws_y) { - return cl::NDRange(4, 16, 1); + if(gws_x < 4) + { + return cl::NDRange(std::min(gws_x, static_cast(2u)), 32, 1); + } + else + { + return cl::NDRange(std::min(gws_x, static_cast(4u)), 8, 1); + } } else { diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index e602fbb525..2e7ed5a4ca 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -116,12 +116,12 @@ __kernel void direct_convolution_nhwc( // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
#define _IWEI_WIDTH WEI_WIDTH #define _IWEI_HEIGHT WEI_HEIGHT -#define _ISRC_WIDTH src_w -#define _ISRC_HEIGHT src_h -#define _ISRC_CHANNELS src_c -#define _IDST_WIDTH dst_w -#define _IDST_HEIGHT dst_h -#define _IDST_CHANNELS dst_c +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _ISRC_CHANNELS SRC_CHANNELS +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _IDST_CHANNELS DST_CHANNELS #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) // If quantized, the output tile has to be quantized first before being stored to global memory @@ -159,12 +159,25 @@ __kernel void direct_convolution_nhwc( for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { - int ck = 0; int xk = i % _IWEI_WIDTH; int yk = i / _IWEI_WIDTH; - int k = 0; - for(; k <= (_ISRC_CHANNELS - K0); k += K0) + TILE(int, M0, 1, my); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + int x_s = xi[i].v + xk; + int y_s = yi[i].v + yk; + my[i].v = x_s + y_s *_ISRC_WIDTH; + my[i].v = my[i].v + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT); + my[i].v = select(-1, my[i].v, x_s >= 0); + my[i].v = select(-1, my[i].v, x_s < _ISRC_WIDTH); + my[i].v = select(-1, my[i].v, y_s >= 0); + my[i].v = select(-1, my[i].v, y_s < _ISRC_HEIGHT); + }) + + int ck = 0; + for(; ck <= (_ISRC_CHANNELS - K0); ck += K0) { TILE(SRC_DATA_TYPE, M0, K0, a); TILE(WEI_DATA_TYPE, N0, K0, b); @@ -175,13 +188,8 @@ __kernel void direct_convolution_nhwc( a[i].v = ZERO_VALUE; }) - LOOP_UNROLLING(int, i, 0, 1, N0, - { - b[i].v = ZERO_VALUE; - }) - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, my, a); // Load tile from the weights tensor T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); @@ -192,15 +200,13 @@ 
__kernel void direct_convolution_nhwc( // Apply the offset correction (correction usually needed for asymmetric quantized computation) // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c); - - ck += K0; } // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS // This #if directive should be removed in case of dynamic tensor support #if defined(LEFTOVER_LOOP) // Left-over accumulations - for(; k < _ISRC_CHANNELS; ++k) + for(; ck < _ISRC_CHANNELS; ++ck) { TILE(SRC_DATA_TYPE, M0, 1, a); TILE(WEI_DATA_TYPE, N0, 1, b); @@ -229,8 +235,6 @@ __kernel void direct_convolution_nhwc( // Apply the offset correction (operation usually needed for asymmetric quantized computation) // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); - - ++ck; } #endif // defined(LEFTOVER_LOOP) } @@ -249,17 +253,6 @@ __kernel void direct_convolution_nhwc( #endif // HAS_BIAS - TILE(uint, M0, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); - dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); - }) - - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - #if defined(IS_QUANTIZED) TILE(DST_DATA_TYPE, M0, N0, cq); @@ -271,6 +264,17 @@ __kernel void direct_convolution_nhwc( // Apply activation T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE); + TILE(uint, M0, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + // 
_IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8 // Store the tile in reverse order so the invalid values are overwritten with the valid ones T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y); diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 4b6144a22d..6279fb4fb6 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -653,6 +653,17 @@ }) \ }) +#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, yi, dst) \ + ({ \ + LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ + { \ + if(yi[_i].v >= 0) \ + { \ + dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[_i].v, STRIDE_Y); \ + } \ + }) \ + }) + /** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates * * @param[in] DATA_TYPE Data type diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index fd14f009e1..781627117a 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -242,6 +242,12 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0))); + build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1))); + build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(2))); + build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0))); + build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1))); + 
build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2))); build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type)); build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); diff --git a/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp index ad94678335..b693568c67 100644 --- a/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp +++ b/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -144,32 +144,45 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( const int32_t ofm = dst_shape[0]; const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; desc.export_weights_to_cl_image = export_weights_to_cl_image; if(dst_shape[0] <= 4) { + // k0 should be as large as possible. However, we should avoid + // having left-over for loops that make the implementation slower.
+ if((k % 16) == 0) + { + desc.k0 = 16; + } + else if((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + if(is_pointwise) { if(ofm == 4) { desc.m0 = 1; desc.n0 = 4; - desc.k0 = 16; } else { desc.m0 = 1; desc.n0 = 1; - desc.k0 = 16; } } else { desc.m0 = 1; desc.n0 = dst_shape[0]; - desc.k0 = 16; } } else @@ -178,21 +191,50 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( { desc.m0 = 1; desc.n0 = 1; - desc.k0 = 16; + if((k % 16) == 0) + { + desc.k0 = 16; + } + else if((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } } else { - if(ofm > 16) + if(ofm >= 16) { - desc.m0 = 4; - desc.n0 = 4; - desc.k0 = 8; + if(m / 6 > 24000) + { + desc.m0 = 6; + } + else + { + desc.m0 = 5; + } + desc.n0 = 8; + desc.k0 = 4; } else { - desc.m0 = 4; - desc.n0 = 4; - desc.k0 = 16; + desc.m0 = 2; + desc.n0 = 8; + if((k % 16) == 0) + { + desc.k0 = 16; + } + else if((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } } } } -- cgit v1.2.1