aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2021-07-07 10:25:41 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2021-07-09 14:17:39 +0000
commit04b39e8e56112dabf6f5746117354680a9985841 (patch)
treeaf603999edcf0c82ac1fd6a861e3f8335befedf5
parent24b892072a2bd8190ba63d09fb0082113d7d032a (diff)
downloadComputeLibrary-04b39e8e56112dabf6f5746117354680a9985841.tar.gz
Limit the LOOP_UNROLLING on the kernel height
To reduce the risk of having a long OpenCL kernel, we limit the loop unrolling on the kernel height. In particular, we unroll only if the kernel height is less than or equal to 5 Resolves COMPMID-4604 Change-Id: Iece787989f36afb90f1c7676b53d9015e652bdbd Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5916 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl15
-rw-r--r--src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl13
2 files changed, 21 insertions, 7 deletions
diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
index 1ec85f37d3..1f940001f3 100644
--- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
@@ -146,7 +146,11 @@ __kernel void dwc_native_fp_nhwc(
c[i].v = 0;
})
+#if _IWEI_HEIGHT <= 5
LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT <= 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+#endif // _IWEI_HEIGHT <= 5
{
TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
@@ -169,10 +173,13 @@ __kernel void dwc_native_fp_nhwc(
{
LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
{
- c[m0].v += a[xk + m0].v *b[xk].v;
+ c[m0].v += a[xk + m0].v * b[xk].v;
})
})
- })
+ }
+#if _IWEI_HEIGHT <= 5
+ )
+#endif // _IWEI_HEIGHT <= 5
#if defined(HAS_BIAS)
TILE(BIA_DATA_TYPE, 1, N0, bias0);
@@ -195,7 +202,7 @@ __kernel void dwc_native_fp_nhwc(
{
int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
VSTORE_PARTIAL(N0, PARTIAL_N0)
- (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w));
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
else
@@ -204,7 +211,7 @@ __kernel void dwc_native_fp_nhwc(
{
int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
VSTORE(N0)
- (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w));
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
}
diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
index cd0f29230d..aa6ba4de39 100644
--- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
@@ -177,7 +177,11 @@ __kernel void dwc_native_quantized_nhwc(
c[i].v = 0;
})
+#if _IWEI_HEIGHT <= 5
LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT <= 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+#endif // _IWEI_HEIGHT <= 5
{
TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
@@ -227,7 +231,10 @@ __kernel void dwc_native_quantized_nhwc(
#endif // _IWEI_WIDTH <= 16
})
})
- })
+ }
+#if _IWEI_HEIGHT <= 5
+ )
+#endif // _IWEI_HEIGHT <= 5
#if _IWEI_WIDTH <= 16
T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
@@ -260,7 +267,7 @@ __kernel void dwc_native_quantized_nhwc(
{
int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
VSTORE_PARTIAL(N0, PARTIAL_N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w));
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
else
@@ -269,7 +276,7 @@ __kernel void dwc_native_quantized_nhwc(
{
int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
VSTORE(N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w));
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
}