From 8d07127f5c8f0e189ee0db4feb88c0c0b47608d5 Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Tue, 7 Dec 2021 13:49:10 +0000 Subject: Use #if directive instead of regular condition in CLDirectConv2D Resolve COMPMID-5004 Signed-off-by: Giorgio Arena Change-Id: Ib3e1b5a891234316c411ea9825ec10c68c4ab5a3 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6788 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Sheri Zhang --- src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 43 +++++++++++------------ src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 1 + src/gpu/cl/operators/ClConv2d.cpp | 9 ++--- 3 files changed, 24 insertions(+), 29 deletions(-) diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index 35ff86a4fb..f1b422a68f 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -192,36 +192,35 @@ __kernel void direct_convolution_nhwc( // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS // This #if directive should be removed in case of dynamic tensor support - if((_ISRC_CHANNELS % K0) != 0) +#if defined(LEFTOVER_LOOP) + // Left-over accumulations + for(; k < _ISRC_CHANNELS; ++k) { - // Left-over accumulations - for(; k < _ISRC_CHANNELS; ++k) - { - TILE(SRC_DATA_TYPE, M0, 1, a); - TILE(WEI_DATA_TYPE, N0, 1, b); + TILE(SRC_DATA_TYPE, M0, 1, a); + TILE(WEI_DATA_TYPE, N0, 1, b); - LOOP_UNROLLING(int, i, 0, 1, M0, - { - a[i].v = ZERO_VALUE; - }) + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = ZERO_VALUE; + }) - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); - // Load tile from the weights tensor - // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration - T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); + // Load tile from the weights tensor + // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration + T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); - // Compute the matrix multiplication between two tiles - T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); + // Compute the matrix multiplication between two tiles + T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); - // Apply the offset correction (operation usually needed for asymmetric quantized computation) - // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero - T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); + // Apply the offset correction (operation usually needed for asymmetric quantized computation) + // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero + T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); - ++ck; - } + ++ck; } +#endif // defined(LEFTOVER_LOOP) } // Offset correction required for the quantized asymmetric computation diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index 7107def8ff..5af7aa9662 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -453,6 +453,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP"); build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); if(is_data_type_quantized(data_type)) diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp index 92b22e758d..2b9a51df22 100644 --- a/src/gpu/cl/operators/ClConv2d.cpp +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -258,20 +258,15 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; const bool is_ifm_gt_ofm = weights->dimension(0U) * weights->dimension(1U) * weights->dimension(2U) > weights->dimension(3U); - const bool is_ofm_le_4 = weights->dimension(3U) <= 4; // Run Winograd if valid and IFM >= 16 if(is_wino_valid && is_ifm_ge_16) { return ConvolutionMethod::WINOGRAD; } - // Run Direct for Large kernel size - if(is_large_kernel_sz && is_ifm_gt_ofm && is_direct_valid) - { - return ConvolutionMethod::DIRECT; - } - if(is_ofm_le_4 && is_ifm_gt_ofm && is_direct_valid) + // Run Direct for Large kernel size + if(is_large_kernel_sz && is_ifm_ge_16 && is_direct_valid && is_ifm_gt_ofm) { return ConvolutionMethod::DIRECT; } -- cgit v1.2.1