From 04b39e8e56112dabf6f5746117354680a9985841 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 7 Jul 2021 10:25:41 +0100 Subject: Limit the LOOP_UNROLLING on the kernel height To reduce the risk of having a long OpenCL kernel, we limit the loop unrolling on the kernel height. In particular, we unroll only if the kernel height is less than or equal to 5 Resolves COMPMID-4604 Change-Id: Iece787989f36afb90f1c7676b53d9015e652bdbd Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5916 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins --- src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl | 15 +++++++++++---- src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl | 13 ++++++++++--- 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl index 1ec85f37d3..1f940001f3 100644 --- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl +++ b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl @@ -146,7 +146,11 @@ __kernel void dwc_native_fp_nhwc( c[i].v = 0; }) +#if _IWEI_HEIGHT <= 5 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 { TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); @@ -169,10 +173,13 @@ __kernel void dwc_native_fp_nhwc( { LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, { - c[m0].v += a[xk + m0].v *b[xk].v; + c[m0].v += a[xk + m0].v * b[xk].v; }) }) - }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 #if defined(HAS_BIAS) TILE(BIA_DATA_TYPE, 1, N0, bias0); @@ -195,7 +202,7 @@ __kernel void dwc_native_fp_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE_PARTIAL(N0, PARTIAL_N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } else @@ -204,7 +211,7 @@ __kernel void dwc_native_fp_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE(N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } } diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl index cd0f29230d..aa6ba4de39 100644 --- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl +++ b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl @@ -177,7 +177,11 @@ __kernel void dwc_native_quantized_nhwc( c[i].v = 0; }) +#if _IWEI_HEIGHT <= 5 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 { TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); @@ -227,7 +231,10 @@ __kernel void dwc_native_quantized_nhwc( #endif // _IWEI_WIDTH <= 16 }) }) - }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 #if _IWEI_WIDTH <= 16 T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c); @@ -260,7 +267,7 @@ __kernel void dwc_native_quantized_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE_PARTIAL(N0, PARTIAL_N0) - (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } else @@ -269,7 +276,7 @@ __kernel void dwc_native_quantized_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE(N0) - (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } } -- cgit v1.2.1