diff options
author | Giorgio Arena <giorgio.arena@arm.com> | 2021-05-13 16:58:51 +0100 |
---|---|---|
committer | Giorgio Arena <giorgio.arena@arm.com> | 2021-05-17 12:08:08 +0000 |
commit | bdd16d1c4832ed416f24908b2c1d060aa4e42f32 (patch) | |
tree | 58e9fa3ebeca7a1bfa0cca23481f61ed30b4fb08 /src/core/CL/cl_kernels/direct_convolution.cl | |
parent | 72ee9b4723485c3da077d765febf45f27acb78cb (diff) | |
download | ComputeLibrary-bdd16d1c4832ed416f24908b2c1d060aa4e42f32.tar.gz |
Add macro to manually unroll loops in OpenCL
Change-Id: I092d10534816f5b3717325952033c351b8231380
Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5643
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/direct_convolution.cl')
-rw-r--r-- | src/core/CL/cl_kernels/direct_convolution.cl | 24 |
1 files changed, 12 insertions, 12 deletions
diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl index a9a997f9ac..e303d2067d 100644 --- a/src/core/CL/cl_kernels/direct_convolution.cl +++ b/src/core/CL/cl_kernels/direct_convolution.cl @@ -141,21 +141,21 @@ __kernel void direct_convolution_nhwc( TILE(int, M0, 1, yi); // Convert the linear index to coordinate - LOOP_UNROLLING(int, i, 0, M0, 1) + LOOP_UNROLLING(int, i, 0, 1, M0, { xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X; yi[i].v = ((mout + i) / _IDST_WIDTH) * STRIDE_Y; xi[i].v -= PAD_LEFT; yi[i].v -= PAD_TOP; - } + }) // Initialize the accumulators TILE(ACC_DATA_TYPE, M0, N0, c); - LOOP_UNROLLING(int, i, 0, M0, 1) + LOOP_UNROLLING(int, i, 0, 1, M0, { c[i].v = 0; - } + }) for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { @@ -169,13 +169,13 @@ __kernel void direct_convolution_nhwc( TILE(SRC_DATA_TYPE, M0, K0, a); TILE(WEI_DATA_TYPE, N0, K0, b); - LOOP_UNROLLING(int, i, 0, M0, 1) + LOOP_UNROLLING(int, i, 0, 1, M0, { a[i].v = ZERO_VALUE; - } + }) // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, 1, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); // Load tile from the weights tensor T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); @@ -199,13 +199,13 @@ __kernel void direct_convolution_nhwc( TILE(SRC_DATA_TYPE, M0, 1, a); TILE(WEI_DATA_TYPE, N0, 1, b); - LOOP_UNROLLING(int, i, 0, M0, 1) + LOOP_UNROLLING(int, i, 0, 1, M0, { a[i].v = ZERO_VALUE; - } + }) // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, 1, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); // Load tile from the weights tensor // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration @@ -240,11 +240,11 @@ __kernel void direct_convolution_nhwc( TILE(uint, M0, 1, dst_indirect_y); // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, M0, 1) + LOOP_UNROLLING(int, i, 0, 1, M0, { dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); - } + }) bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; |