diff options
-rw-r--r-- | src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl | 15 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl | 13 |
2 files changed, 21 insertions, 7 deletions
diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl index 1ec85f37d3..1f940001f3 100644 --- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl +++ b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl @@ -146,7 +146,11 @@ __kernel void dwc_native_fp_nhwc( c[i].v = 0; }) +#if _IWEI_HEIGHT <= 5 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 { TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); @@ -169,10 +173,13 @@ __kernel void dwc_native_fp_nhwc( { LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, { - c[m0].v += a[xk + m0].v *b[xk].v; + c[m0].v += a[xk + m0].v * b[xk].v; }) }) - }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 #if defined(HAS_BIAS) TILE(BIA_DATA_TYPE, 1, N0, bias0); @@ -195,7 +202,7 @@ __kernel void dwc_native_fp_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE_PARTIAL(N0, PARTIAL_N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } else @@ -204,7 +211,7 @@ __kernel void dwc_native_fp_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE(N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } } diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl index cd0f29230d..aa6ba4de39 100644 --- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl +++ b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl @@ -177,7 +177,11 @@ __kernel void dwc_native_quantized_nhwc( c[i].v = 0; }) +#if _IWEI_HEIGHT <= 5 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 { TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); @@ -227,7 +231,10 @@ __kernel void dwc_native_quantized_nhwc( #endif // _IWEI_WIDTH <= 16 }) }) - }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 #if _IWEI_WIDTH <= 16 T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c); @@ -260,7 +267,7 @@ __kernel void dwc_native_quantized_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE_PARTIAL(N0, PARTIAL_N0) - (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } else @@ -269,7 +276,7 @@ __kernel void dwc_native_quantized_nhwc( { int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); VSTORE(N0) - (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + ((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + xi_out * dst_stride_y + yo * dst_stride_z + bout * dst_stride_w)); + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); }) } } |