diff options
author | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2021-07-07 10:25:41 +0100 |
---|---|---|
committer | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2021-07-09 14:17:39 +0000 |
commit | 04b39e8e56112dabf6f5746117354680a9985841 (patch) | |
tree | af603999edcf0c82ac1fd6a861e3f8335befedf5 /src | |
parent | 24b892072a2bd8190ba63d09fb0082113d7d032a (diff) | |
download | ComputeLibrary-04b39e8e56112dabf6f5746117354680a9985841.tar.gz |
Limit the LOOP_UNROLLING on the kernel height
To reduce the risk of having a long OpenCL kernel, we limit the loop
unrolling on the kernel height. In particular, we unroll only if the
kernel height is less than or equal to 5
Resolves COMPMID-4604
Change-Id: Iece787989f36afb90f1c7676b53d9015e652bdbd
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5916
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl | 15 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl | 13 |
2 files changed, 21 insertions, 7 deletions
diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl index 1ec85f37d3..1f940001f3 100644 --- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl +++ b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl @@ -146,7 +146,11 @@ __kernel void dwc_native_fp_nhwc( c[i].v = 0; }) +#if _IWEI_HEIGHT <= 5 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 { TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); @@ -169,10 +173,13 @@ __kernel void dwc_native_fp_nhwc( { LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, { - c[m0].v += a[xk + m0].v *b[xk].v; + c[m0].v += a[xk + m0].v * b[xk].v; }) }) - }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 #if defined(HAS_BIAS) TILE(BIA_DATA_TYPE, 1, N0, bias0); @@ -195,7 +202,7 @@ __kernel void dwc_native_fp_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE_PARTIAL(N0, PARTIAL_N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } else @@ -204,7 +211,7 @@ __kernel void dwc_native_fp_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE(N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } } diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl index cd0f29230d..aa6ba4de39 100644 --- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl +++ b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl @@ -177,7 +177,11 @@ __kernel void dwc_native_quantized_nhwc( c[i].v = 0; }) +#if _IWEI_HEIGHT <= 5 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 { TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); @@ -227,7 +231,10 @@ __kernel void dwc_native_quantized_nhwc( #endif // _IWEI_WIDTH <= 16 }) }) - }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 #if _IWEI_WIDTH <= 16 T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c); @@ -260,7 +267,7 @@ __kernel void dwc_native_quantized_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE_PARTIAL(N0, PARTIAL_N0) - (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } else @@ -269,7 +276,7 @@ __kernel void dwc_native_quantized_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE(N0) - (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } } |