diff options
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 61 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 12 | ||||
-rw-r--r-- | src/gpu/cl/operators/ClConv2d.cpp | 10 |
3 files changed, 42 insertions, 41 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index 75a7a0f004..35ff86a4fb 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -103,9 +103,9 @@ */ //! @endcond __kernel void direct_convolution_nhwc( - TENSOR4D(src, SRC_TENSOR_TYPE), - TENSOR4D(dst, DST_TENSOR_TYPE), - TENSOR4D(wei, WEI_TENSOR_TYPE) + TENSOR4D_T(src, SRC_TENSOR_TYPE), + TENSOR4D_T(dst, DST_TENSOR_TYPE), + TENSOR4D_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) @@ -116,12 +116,12 @@ __kernel void direct_convolution_nhwc( // In case of dynamic tensor support, the following dimensions should be passed as function argument. #define _IWEI_WIDTH WEI_WIDTH #define _IWEI_HEIGHT WEI_HEIGHT -#define _ISRC_WIDTH SRC_WIDTH -#define _ISRC_HEIGHT SRC_HEIGHT -#define _ISRC_CHANNELS SRC_CHANNELS -#define _IDST_WIDTH DST_WIDTH -#define _IDST_HEIGHT DST_HEIGHT -#define _IDST_CHANNELS DST_CHANNELS +#define _ISRC_WIDTH src_w +#define _ISRC_HEIGHT src_h +#define _ISRC_CHANNELS src_c +#define _IDST_WIDTH dst_w +#define _IDST_HEIGHT dst_h +#define _IDST_CHANNELS dst_c #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) // If quantized, the output tile has to be quantized first before being stored to global memory @@ -192,35 +192,36 @@ __kernel void direct_convolution_nhwc( // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS // This #if directive should be removed in case of dynamic tensor support -#if((SRC_CHANNELS % K0) != 0) - // Left-over accumulations - for(; k < _ISRC_CHANNELS; ++k) + if((_ISRC_CHANNELS % K0) != 0) { - TILE(SRC_DATA_TYPE, M0, 1, a); - TILE(WEI_DATA_TYPE, N0, 1, b); - - LOOP_UNROLLING(int, i, 0, 1, M0, + // Left-over accumulations + for(; k < _ISRC_CHANNELS; ++k) { - a[i].v = ZERO_VALUE; - }) + TILE(SRC_DATA_TYPE, M0, 1, a); + TILE(WEI_DATA_TYPE, N0, 1, b); - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = ZERO_VALUE; + }) - // Load tile from the weights tensor - // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration - T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); - // Compute the matrix multiplication between two tiles - T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); + // Load tile from the weights tensor + // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration + T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); - // Apply the offset correction (operation usually needed for asymmetric quantized computation) - // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero - T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); + // Compute the matrix multiplication between two tiles + T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); + + // Apply the offset correction (operation usually needed for asymmetric quantized computation) + // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero + T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); - ++ck; + ++ck; + } } -#endif // ((SRC_CHANNELS % K0) != 0) } // Offset correction required for the quantized asymmetric computation diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index 2d851a6982..7107def8ff 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -438,14 +438,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-cl-fast-relaxed-math"); build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); - build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); - build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx))); - build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx))); - build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); @@ -613,13 +607,13 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl } unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); + add_4d_tensor_nhwc_argument(idx, src); + add_4d_tensor_nhwc_argument(idx, dst); if(export_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } - add_4D_tensor_argument(idx, weights, slice); + add_4d_tensor_nhwc_argument(idx, weights); if(biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp index d633c8f738..92b22e758d 100644 --- a/src/gpu/cl/operators/ClConv2d.cpp +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -257,7 +257,8 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const { const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; - const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + const bool is_ifm_gt_ofm = weights->dimension(0U) * weights->dimension(1U) * weights->dimension(2U) > weights->dimension(3U); + const bool is_ofm_le_4 = weights->dimension(3U) <= 4; // Run Winograd if valid and IFM >= 16 if(is_wino_valid && is_ifm_ge_16) @@ -265,7 +266,12 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const return ConvolutionMethod::WINOGRAD; } // Run Direct for Large kernel size - if(is_large_kernel_sz && is_ifm_ge_16 && is_direct_valid && is_ifm_gt_ofm) + if(is_large_kernel_sz && is_ifm_gt_ofm && is_direct_valid) + { + return ConvolutionMethod::DIRECT; + } + + if(is_ofm_le_4 && is_ifm_gt_ofm && is_direct_valid) { return ConvolutionMethod::DIRECT; } |