aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2021-12-01 09:26:14 +0000
committerGian Marco Iodice <gianmarco.iodice@arm.com>2021-12-01 12:38:25 +0000
commit78baa48308cba4101b4bcb4680f2f4ca90aeefd7 (patch)
tree62b049b4cb8cffc78559c14dcaa2dbcea2ebc871
parent56d55123527b5bb84a5c3516f161dd4438cdc7d8 (diff)
downloadComputeLibrary-78baa48308cba4101b4bcb4680f2f4ca90aeefd7.tar.gz
Improve start-up direct convolution on OpenCL
- Pass arguments at runtime - Rework ClConv2D heuristic to select direct convolution when OFM < IFM also for small kernel sizes Resolves COMPMID-5000 Change-Id: I9b538e29093829bc366d24d1e904341c247fa22b Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6771 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/cl_kernels/nhwc/direct_convolution.cl61
-rw-r--r--src/gpu/cl/kernels/ClDirectConv2dKernel.cpp12
-rw-r--r--src/gpu/cl/operators/ClConv2d.cpp10
3 files changed, 42 insertions, 41 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index 75a7a0f004..35ff86a4fb 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -103,9 +103,9 @@
*/
//! @endcond
__kernel void direct_convolution_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE),
- TENSOR4D(wei, WEI_TENSOR_TYPE)
+ TENSOR4D_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bia)
@@ -116,12 +116,12 @@ __kernel void direct_convolution_nhwc(
// In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _ISRC_CHANNELS SRC_CHANNELS
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _IDST_CHANNELS DST_CHANNELS
+#define _ISRC_WIDTH src_w
+#define _ISRC_HEIGHT src_h
+#define _ISRC_CHANNELS src_c
+#define _IDST_WIDTH dst_w
+#define _IDST_HEIGHT dst_h
+#define _IDST_CHANNELS dst_c
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
// If quantized, the output tile has to be quantized first before being stored to global memory
@@ -192,35 +192,36 @@ __kernel void direct_convolution_nhwc(
// We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS
// This #if directive should be removed in case of dynamic tensor support
-#if((SRC_CHANNELS % K0) != 0)
- // Left-over accumulations
- for(; k < _ISRC_CHANNELS; ++k)
+ if((_ISRC_CHANNELS % K0) != 0)
{
- TILE(SRC_DATA_TYPE, M0, 1, a);
- TILE(WEI_DATA_TYPE, N0, 1, b);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
{
- a[i].v = ZERO_VALUE;
- })
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
- // Load tile from the src tensor
- T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
- // Load tile from the weights tensor
- // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
- T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a);
- // Compute the matrix multiplication between two tiles
- T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
- // Apply the offset correction (operation usually needed for asymmetric quantized computation)
- // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
- T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+ // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
- ++ck;
+ ++ck;
+ }
}
-#endif // ((SRC_CHANNELS % K0) != 0)
}
// Offset correction required for the quantized asymmetric computation
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 2d851a6982..7107def8ff 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -438,14 +438,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-cl-fast-relaxed-math");
build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
- build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
- build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
- build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
- build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx)));
- build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx)));
- build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx)));
build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
@@ -613,13 +607,13 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
}
unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice);
- add_4D_tensor_argument(idx, dst, slice);
+ add_4d_tensor_nhwc_argument(idx, src);
+ add_4d_tensor_nhwc_argument(idx, dst);
if(export_to_cl_image)
{
_kernel.setArg(idx++, weights_cl_image);
}
- add_4D_tensor_argument(idx, weights, slice);
+ add_4d_tensor_nhwc_argument(idx, weights);
if(biases != nullptr)
{
add_1D_tensor_argument(idx, biases, slice);
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index d633c8f738..92b22e758d 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -257,7 +257,8 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
{
const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
- const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
+ const bool is_ifm_gt_ofm = weights->dimension(0U) * weights->dimension(1U) * weights->dimension(2U) > weights->dimension(3U);
+ const bool is_ofm_le_4 = weights->dimension(3U) <= 4;
// Run Winograd if valid and IFM >= 16
if(is_wino_valid && is_ifm_ge_16)
@@ -265,7 +266,12 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
return ConvolutionMethod::WINOGRAD;
}
// Run Direct for Large kernel size
- if(is_large_kernel_sz && is_ifm_ge_16 && is_direct_valid && is_ifm_gt_ofm)
+ if(is_large_kernel_sz && is_ifm_gt_ofm && is_direct_valid)
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+
+ if(is_ofm_le_4 && is_ifm_gt_ofm && is_direct_valid)
{
return ConvolutionMethod::DIRECT;
}