aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2022-09-16 14:14:21 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2022-11-01 09:24:45 +0000
commit3394f3e3df7fd2d924c41822a8564493fc06473a (patch)
tree8859ab95e39a237b204031a2aa68cde752003dde
parent910e3f9b686d16657e37d4c18f234b566c8deec2 (diff)
downloadComputeLibrary-3394f3e3df7fd2d924c41822a8564493fc06473a.tar.gz
Rework direct convolution heuristic on OpenCL
Resolves COMPMID-5634 Change-Id: I075de70d509d0c4430b4bcf3f218384e237a3a56 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/453708 Tested-by: bsgcomp <bsgcomp@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Comments-Addressed: bsgcomp <bsgcomp@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8473 Benchmark: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
-rw-r--r--src/core/CL/DefaultLWSHeuristics.cpp9
-rw-r--r--src/core/CL/cl_kernels/nhwc/direct_convolution.cl66
-rw-r--r--src/core/CL/cl_kernels/tile_helpers.h11
-rw-r--r--src/gpu/cl/kernels/ClDirectConv2dKernel.cpp6
-rw-r--r--src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp64
5 files changed, 113 insertions, 43 deletions
diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp
index c739b9dc03..a53fdbbab6 100644
--- a/src/core/CL/DefaultLWSHeuristics.cpp
+++ b/src/core/CL/DefaultLWSHeuristics.cpp
@@ -61,7 +61,14 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z)
if(gws_x < gws_y)
{
- return cl::NDRange(4, 16, 1);
+ if(gws_x < 4)
+ {
+ return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1);
+ }
+ else
+ {
+ return cl::NDRange(std::min(gws_x, static_cast<size_t>(4u)), 8, 1);
+ }
}
else
{
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index e602fbb525..2e7ed5a4ca 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -116,12 +116,12 @@ __kernel void direct_convolution_nhwc(
// In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
-#define _ISRC_WIDTH src_w
-#define _ISRC_HEIGHT src_h
-#define _ISRC_CHANNELS src_c
-#define _IDST_WIDTH dst_w
-#define _IDST_HEIGHT dst_h
-#define _IDST_CHANNELS dst_c
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
// If quantized, the output tile has to be quantized first before being stored to global memory
@@ -159,12 +159,25 @@ __kernel void direct_convolution_nhwc(
for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
{
- int ck = 0;
int xk = i % _IWEI_WIDTH;
int yk = i / _IWEI_WIDTH;
- int k = 0;
- for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ TILE(int, M0, 1, my);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ int x_s = xi[i].v + xk;
+ int y_s = yi[i].v + yk;
+ my[i].v = x_s + y_s *_ISRC_WIDTH;
+ my[i].v = my[i].v + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
+ my[i].v = select(-1, my[i].v, x_s >= 0);
+ my[i].v = select(-1, my[i].v, x_s < _ISRC_WIDTH);
+ my[i].v = select(-1, my[i].v, y_s >= 0);
+ my[i].v = select(-1, my[i].v, y_s < _ISRC_HEIGHT);
+ })
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
{
TILE(SRC_DATA_TYPE, M0, K0, a);
TILE(WEI_DATA_TYPE, N0, K0, b);
@@ -175,13 +188,8 @@ __kernel void direct_convolution_nhwc(
a[i].v = ZERO_VALUE;
})
- LOOP_UNROLLING(int, i, 0, 1, N0,
- {
- b[i].v = ZERO_VALUE;
- })
-
// Load tile from the src tensor
- T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a);
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, my, a);
// Load tile from the weights tensor
T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
@@ -192,15 +200,13 @@ __kernel void direct_convolution_nhwc(
// Apply the offset correction (correction usually needed for asymmetric quantized computation)
// The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
-
- ck += K0;
}
// We voluntarily use SRC_CHANNELS rather than _ISRC_CHANNELS
// This #if directive should be removed in case of dynamic tensor support
#if defined(LEFTOVER_LOOP)
// Left-over accumulations
- for(; k < _ISRC_CHANNELS; ++k)
+ for(; ck < _ISRC_CHANNELS; ++ck)
{
TILE(SRC_DATA_TYPE, M0, 1, a);
TILE(WEI_DATA_TYPE, N0, 1, b);
@@ -229,8 +235,6 @@ __kernel void direct_convolution_nhwc(
// Apply the offset correction (operation usually needed for asymmetric quantized computation)
// The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
-
- ++ck;
}
#endif // defined(LEFTOVER_LOOP)
}
@@ -249,17 +253,6 @@ __kernel void direct_convolution_nhwc(
#endif // HAS_BIAS
- TILE(uint, M0, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
- dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
- })
-
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
-
#if defined(IS_QUANTIZED)
TILE(DST_DATA_TYPE, M0, N0, cq);
@@ -271,6 +264,17 @@ __kernel void direct_convolution_nhwc(
// Apply activation
T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
// _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8
// Store the tile in reverse order so the invalid values are overwritten with the valid ones
T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index 4b6144a22d..6279fb4fb6 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -653,6 +653,17 @@
}) \
})
+#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, yi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ if(yi[_i].v >= 0) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[_i].v, STRIDE_Y); \
+ } \
+ }) \
+ })
+
/** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates
*
* @param[in] DATA_TYPE Data type
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index fd14f009e1..781627117a 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -242,6 +242,12 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0)));
+ build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1)));
+ build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(2)));
+ build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0)));
+ build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1)));
+ build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2)));
build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
diff --git a/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp
index ad94678335..b693568c67 100644
--- a/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp
+++ b/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp
@@ -144,32 +144,45 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
const int32_t ofm = dst_shape[0];
const int32_t m = dst_shape[1] * dst_shape[2];
+ const int32_t k = wei_shape[0];
const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
desc.export_weights_to_cl_image = export_weights_to_cl_image;
if(dst_shape[0] <= 4)
{
+ // k0 should be as large as possible. However, we should avoid
+ // having left-over for loops that make the implementation slower.
+ if((k % 16) == 0)
+ {
+ desc.k0 = 16;
+ }
+ else if((k % 8) == 0)
+ {
+ desc.k0 = 8;
+ }
+ else
+ {
+ desc.k0 = 4;
+ }
+
if(is_pointwise)
{
if(ofm == 4)
{
desc.m0 = 1;
desc.n0 = 4;
- desc.k0 = 16;
}
else
{
desc.m0 = 1;
desc.n0 = 1;
- desc.k0 = 16;
}
}
else
{
desc.m0 = 1;
desc.n0 = dst_shape[0];
- desc.k0 = 16;
}
}
else
@@ -178,21 +191,50 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
{
desc.m0 = 1;
desc.n0 = 1;
- desc.k0 = 16;
+ if((k % 16) == 0)
+ {
+ desc.k0 = 16;
+ }
+ else if((k % 8) == 0)
+ {
+ desc.k0 = 8;
+ }
+ else
+ {
+ desc.k0 = 4;
+ }
}
else
{
- if(ofm > 16)
+ if(ofm >= 16)
{
- desc.m0 = 4;
- desc.n0 = 4;
- desc.k0 = 8;
+ if(m / 6 > 24000)
+ {
+ desc.m0 = 6;
+ }
+ else
+ {
+ desc.m0 = 5;
+ }
+ desc.n0 = 8;
+ desc.k0 = 4;
}
else
{
- desc.m0 = 4;
- desc.n0 = 4;
- desc.k0 = 16;
+ desc.m0 = 2;
+ desc.n0 = 8;
+ if((k % 16) == 0)
+ {
+ desc.k0 = 16;
+ }
+ else if((k % 8) == 0)
+ {
+ desc.k0 = 8;
+ }
+ else
+ {
+ desc.k0 = 4;
+ }
}
}
}