diff options
Diffstat (limited to 'src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl')
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl | 90 |
1 files changed, 44 insertions, 46 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl index b24a6ae85f..8b14b27643 100644 --- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl @@ -25,7 +25,8 @@ #include "activation_float_helpers.h" #include "helpers.h" #include "tile_helpers.h" - +// *INDENT-OFF* +// clang-format off #if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) //! @cond Doxygen_Suppress /** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16) @@ -51,6 +52,7 @@ * - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only) * - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE) * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1) + * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1) * * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -101,7 +103,7 @@ __kernel void dwc_native_fp_nhwc( #define _IWEI_WIDTH WEI_WIDTH #define _IWEI_HEIGHT WEI_HEIGHT #define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension) -#define _IN0_A N0 // Cols tile A +#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1) #define _IM0_B _IWEI_WIDTH // Rows tile B #define _IN0_B N0 // Cols tile B #define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1))) @@ -121,39 +123,34 @@ __kernel void dwc_native_fp_nhwc( xi -= PAD_LEFT; yi -= PAD_TOP; - int d = 0; -#if DEPTH_MULTIPLIER != 1 - for(; d < DEPTH_MULTIPLIER; d++) -#endif // DEPTH_MULTIPLIER != 1 - { - TILE(ACC_DATA_TYPE, M0, N0, c); + TILE(ACC_DATA_TYPE, M0, N0, c); - // Reset accumulators - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c[i].v = 0; - }) + // Reset accumulators + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = 0; + }) #if _IWEI_HEIGHT < 5 - LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, + LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, #else // _IWEI_HEIGHT <= 5 - for(int yk = 0; yk < _IWEI_HEIGHT; yk++) + for(int yk = 0; yk < _IWEI_HEIGHT; ++yk) #endif // _IWEI_HEIGHT <= 5 - { - TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); + { + TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); - LOOP_UNROLLING(int, i, 0, 1, _IM0_A, + LOOP_UNROLLING(int, i, 0, 1, _IM0_A, { a[i].v = 0; }) // Load tile from the src tensor (TILE A) - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a); TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); // Load tile from the weights tensor (TILE B) - T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b); + T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b); // Optimized path for STRIDE_X == 1 // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension @@ -161,47 +158,48 @@ __kernel void dwc_native_fp_nhwc( { LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, { - c[m0].v += a[xk + m0].v *b[xk].v; + c[m0].v = fma(a[xk + m0].v, b[xk].v, c[m0].v); }) }) - } + } #if _IWEI_HEIGHT < 5 ) #endif // _IWEI_HEIGHT <= 5 #if defined(HAS_BIAS) - TILE(BIA_DATA_TYPE, 1, N0, bias0); + TILE(BIA_DATA_TYPE, 1, N0, bias0); - T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0); + T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0); - // c = c + bias[broadcasted] - T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); + // c = c + bias[broadcasted] + T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); #endif // HAS_BIAS - T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c); + T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c); - TILE(uint, M0, 1, dst_indirect_y); + TILE(uint, M0, 1, dst_indirect_y); - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - if(x_cond) + if(x_cond) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, { - LOOP_UNROLLING(int, m0, 0, 1, M0, - { - int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1); - VSTORE_PARTIAL(N0, PARTIAL_N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); - }) - } - else + int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1); + VSTORE_PARTIAL(N0, PARTIAL_N0) + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, { - LOOP_UNROLLING(int, m0, 0, 1, M0, - { - int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1); - VSTORE(N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); - }) - } + int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1); + VSTORE(N0) + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) } } -#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file +#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) +// *INDENT-ON* +// clang-format on
\ No newline at end of file |