aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl15
-rw-r--r--src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl13
2 files changed, 21 insertions, 7 deletions
diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
index 1ec85f37d3..1f940001f3 100644
--- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
@@ -146,7 +146,11 @@ __kernel void dwc_native_fp_nhwc(
c[i].v = 0;
})
+#if _IWEI_HEIGHT <= 5
LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT <= 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+#endif // _IWEI_HEIGHT <= 5
{
TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
@@ -169,10 +173,13 @@ __kernel void dwc_native_fp_nhwc(
{
LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
{
- c[m0].v += a[xk + m0].v *b[xk].v;
+ c[m0].v += a[xk + m0].v * b[xk].v;
})
})
- })
+ }
+#if _IWEI_HEIGHT <= 5
+ )
+#endif // _IWEI_HEIGHT <= 5
#if defined(HAS_BIAS)
TILE(BIA_DATA_TYPE, 1, N0, bias0);
@@ -195,7 +202,7 @@ __kernel void dwc_native_fp_nhwc(
{
int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
VSTORE_PARTIAL(N0, PARTIAL_N0)
- (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w));
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
else
@@ -204,7 +211,7 @@ __kernel void dwc_native_fp_nhwc(
{
int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
VSTORE(N0)
- (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w));
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
}
diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
index cd0f29230d..aa6ba4de39 100644
--- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
@@ -177,7 +177,11 @@ __kernel void dwc_native_quantized_nhwc(
c[i].v = 0;
})
+#if _IWEI_HEIGHT <= 5
LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT <= 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+#endif // _IWEI_HEIGHT <= 5
{
TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
@@ -227,7 +231,10 @@ __kernel void dwc_native_quantized_nhwc(
#endif // _IWEI_WIDTH <= 16
})
})
- })
+ }
+#if _IWEI_HEIGHT <= 5
+ )
+#endif // _IWEI_HEIGHT <= 5
#if _IWEI_WIDTH <= 16
T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
@@ -260,7 +267,7 @@ __kernel void dwc_native_quantized_nhwc(
{
int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
VSTORE_PARTIAL(N0, PARTIAL_N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w));
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
else
@@ -269,7 +276,7 @@ __kernel void dwc_native_quantized_nhwc(
{
int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
VSTORE(N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w));
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
}