From 211a55d8218764c0a20d69d4cbdaea1906291c6b Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Wed, 31 Aug 2022 11:47:08 +0100
Subject: Optimize depthwise convolution on OpenCL

The optimization concerns the case where the depth multiplier is > 1.
The depth-multiplier for loop has been removed from the OpenCL kernel
and the GWS has been mapped to the output shape. In this way, we can
still process a tile with N0 columns and improve the performance of
depthwise convolution by over 80% when the depth multiplier is > 1.

Resolves COMPMID-5568

Change-Id: I604e287d4eeb31c54b9cc6c3072a698cd0e3e136
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8184
Tested-by: Arm Jenkins
Reviewed-by: Gunes Bayir
Benchmark: Arm Jenkins
---
 src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl  |  90 ++++++-----
 .../cl_kernels/nhwc/dwc_native_quantized_nhwc.cl   | 171 ++++++++++-----------
 .../CLDepthwiseConvolutionLayerNativeKernel.cpp    |  19 +--
 3 files changed, 137 insertions(+), 143 deletions(-)

(limited to 'src/core/CL')
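The change in a nutshell, as a standalone C++ sketch (illustrative only, not part of the patch; the helper name is made up): each work-item is now addressed by an output channel, and the input channel it reads is recovered with a division by the depth multiplier, which is exactly what the (cout / DEPTH_MULTIPLIER) index in the kernels below computes. Weights and biases are indexed directly by the output channel, so the per-work-item loop over the depth multiplier disappears.

    #include <cassert>

    // Hypothetical helper mirroring the kernel's (cout / DEPTH_MULTIPLIER) indexing.
    int input_channel_for_output(int cout, int depth_multiplier)
    {
        return cout / depth_multiplier;
    }

    int main()
    {
        // With a depth multiplier of 4, output channels 0..3 read input channel 0,
        // output channels 4..7 read input channel 1, and so on.
        assert(input_channel_for_output(3, 4) == 0);
        assert(input_channel_for_output(5, 4) == 1);
        assert(input_channel_for_output(11, 4) == 2);
        return 0;
    }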
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index b24a6ae85f..8b14b27643 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -25,7 +25,8 @@
 #include "activation_float_helpers.h"
 #include "helpers.h"
 #include "tile_helpers.h"
-
+// *INDENT-OFF*
+// clang-format off
 #if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
 //! @cond Doxygen_Suppress
 /** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
@@ -51,6 +52,7 @@
  * - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
  * - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
  * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
  *
  * @param[in] src_ptr      Pointer to the source tensor. Supported data type: F16/F32
  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -101,7 +103,7 @@ __kernel void dwc_native_fp_nhwc(
 #define _IWEI_WIDTH WEI_WIDTH
 #define _IWEI_HEIGHT WEI_HEIGHT
 #define _IM0_A M0_A        // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension)
-#define _IN0_A N0          // Cols tile A
+#define _IN0_A N0_A        // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
 #define _IM0_B _IWEI_WIDTH // Rows tile B
 #define _IN0_B N0          // Cols tile B
 #define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
@@ -121,39 +123,34 @@ __kernel void dwc_native_fp_nhwc(
     xi -= PAD_LEFT;
     yi -= PAD_TOP;
 
-    int d = 0;
-#if DEPTH_MULTIPLIER != 1
-    for(; d < DEPTH_MULTIPLIER; d++)
-#endif // DEPTH_MULTIPLIER != 1
-    {
-        TILE(ACC_DATA_TYPE, M0, N0, c);
+    TILE(ACC_DATA_TYPE, M0, N0, c);
 
-        // Reset accumulators
-        LOOP_UNROLLING(int, i, 0, 1, M0,
-        {
-            c[i].v = 0;
-        })
+    // Reset accumulators
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
 
 #if _IWEI_HEIGHT < 5
-        LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+    LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
 #else // _IWEI_HEIGHT <= 5
-        for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+    for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
 #endif // _IWEI_HEIGHT <= 5
-        {
-            TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+    {
+        TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
 
-            LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+        LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
         {
             a[i].v = 0;
         })
 
         // Load tile from the src tensor (TILE A)
-        T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+        T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
 
         TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
 
         // Load tile from the weights tensor (TILE B)
-        T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b);
+        T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
 
         // Optimized path for STRIDE_X == 1
         // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
@@ -161,47 +158,48 @@ __kernel void dwc_native_fp_nhwc(
         {
             LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
             {
-                c[m0].v += a[xk + m0].v *b[xk].v;
+                c[m0].v = fma(a[xk + m0].v, b[xk].v, c[m0].v);
             })
         })
-    }
+    }
 #if _IWEI_HEIGHT < 5
     )
 #endif // _IWEI_HEIGHT <= 5
 
 #if defined(HAS_BIAS)
-        TILE(BIA_DATA_TYPE, 1, N0, bias0);
+    TILE(BIA_DATA_TYPE, 1, N0, bias0);
 
-        T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0);
+    T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
 
-        // c = c + bias[broadcasted]
-        T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 #endif // HAS_BIAS
 
-        T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+    T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
 
-        TILE(uint, M0, 1, dst_indirect_y);
+    TILE(uint, M0, 1, dst_indirect_y);
 
-        bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
 
-        if(x_cond)
+    if(x_cond)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
         {
-            LOOP_UNROLLING(int, m0, 0, 1, M0,
-            {
-                int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
-                VSTORE_PARTIAL(N0, PARTIAL_N0)
-                (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
-            })
-        }
-        else
+            int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+            VSTORE_PARTIAL(N0, PARTIAL_N0)
+            (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
         {
-            LOOP_UNROLLING(int, m0, 0, 1, M0,
-            {
-                int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
-                VSTORE(N0)
-                (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
-            })
-        }
+            int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+            VSTORE(N0)
+            (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
        })
     }
 }
-#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
\ No newline at end of file
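The N0_A compile-time option introduced above controls how many src columns the load of tile A fetches. When the depth multiplier is > 1, the N0 output channels of a tile all divide down to the same input channel (validate_arguments() below requires that N0 divides the depth multiplier), so a single src column (N0_A = 1) is broadcast across the N0 accumulator lanes while the weights still supply N0 distinct columns. A minimal C++ sketch of that mapping, under the same divisibility assumption (values are made up):

    #include <cstdio>

    int main()
    {
        const int n0 = 4, depth_multiplier = 8; // n0 divides depth_multiplier
        for(int cout = 0; cout < 16; cout += n0)
        {
            // cout is the first output channel of an N0-wide tile; every channel
            // in [cout, cout + n0) maps to the same input channel, so one src
            // column serves the whole tile.
            printf("output channels [%2d, %2d) -> input channel %d\n", cout, cout + n0, cout / depth_multiplier);
        }
        return 0;
    }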
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index 263a23ef28..e2ffd444dd 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -25,6 +25,8 @@
 #include "helpers.h"
 #include "tile_helpers.h"
 
+// *INDENT-OFF*
+// clang-format off
 #define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE)
 #define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE
 #define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0)
@@ -35,11 +37,11 @@
 #define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \
     ({})
 
-#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL()                                                                           \
-    TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers);                                                                 \
-    TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts);                                                                           \
-    T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, multipliers); \
-    T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, shifts);
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL()                                                     \
+    TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers);                                           \
+    TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts);                                                     \
+    T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout, 0, 0, 0, multipliers); \
+    T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout, 0, 0, 0, shifts);
 
 #define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
 #define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
@@ -73,6 +75,7 @@
  * - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
  * - N0 = 2, 3, 4, 8, 16
  * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
  *
  * @param[in] src_ptr      Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -133,7 +136,7 @@ __kernel void dwc_native_quantized_nhwc(
 #define _IWEI_WIDTH WEI_WIDTH
 #define _IWEI_HEIGHT WEI_HEIGHT
 #define _IM0_A M0_A        // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension)
-#define _IN0_A N0          // Cols tile A
+#define _IN0_A N0_A        // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
 #define _IM0_B _IWEI_WIDTH // Rows tile B
 #define _IN0_B N0          // Cols tile B
 #define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
@@ -153,121 +156,117 @@ __kernel void dwc_native_quantized_nhwc(
     xi -= PAD_LEFT;
     yi -= PAD_TOP;
 
-    int d = 0;
-#if DEPTH_MULTIPLIER != 1
-    for(; d < DEPTH_MULTIPLIER; d++)
-#endif // DEPTH_MULTIPLIER != 1
-    {
-        TILE(ACC_DATA_TYPE, M0, N0, c);
+    TILE(ACC_DATA_TYPE, M0, N0, c);
 
-        // Reset accumulators
-        LOOP_UNROLLING(int, i, 0, 1, M0,
-        {
-            c[i].v = 0;
-        })
+    // Reset accumulators
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
 
 #if _IWEI_HEIGHT <= 5
-        LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+    LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
 #else // _IWEI_HEIGHT <= 5
-        for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+    for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
 #endif // _IWEI_HEIGHT <= 5
-        {
-            TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+    {
+        TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
 
-            LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
-            {
-                a[i].v = ZERO_VALUE;
-            })
+        LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+        {
+            a[i].v = ZERO_VALUE;
+        })
 
-            // Load tile from the src tensor (TILE A)
-            T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+        // Load tile from the src tensor (TILE A)
+        T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
 
-            TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+        TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
 
-            // Load tile from the weights tensor (TILE B)
-            T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b);
+        // Load tile from the weights tensor (TILE B)
+        T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
 
-            // Optimized path for STRIDE_X == 1
-            // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
-            LOOP_UNROLLING(int, m0, 0, 1, M0,
+        // Optimized path for STRIDE_X == 1
+        // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
             {
-                LOOP_UNROLLING(int, n0, 0, 1, N0,
-                {
 #if _IWEI_WIDTH <= 16
 #define DOT_DATA_TYPE SRC_DATA_TYPE
 #define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE))
-                    // Optimized path for the dot instruction
-                    TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
-                    TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
-                    ACC_DATA_TYPE offset_a = 0;
-                    ACC_DATA_TYPE offset_b = 0;
+                // Optimized path for the dot instruction
+                TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
+                TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
+                ACC_DATA_TYPE offset_a = 0;
+                ACC_DATA_TYPE offset_b = 0;
 
-                    LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
-                    {
-                        x0[0].s[xk] = a[xk + m0].s[n0];
-                        y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
-                    })
-                    DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
-                    REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
-                    REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
-                    c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
+                LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+                {
+                    x0[0].s[xk] = a[xk + m0].s[n0];
+                    y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
+                })
+                DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
+                REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
+                REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
+                c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
 #else // _IWEI_WIDTH <= 16
-                    LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
-                    {
-                        c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
-                    })
-#endif // _IWEI_WIDTH <= 16
+                LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+                {
+                    c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
                 })
+#endif // _IWEI_WIDTH <= 16
             })
-            }
+        })
+    }
 #if _IWEI_HEIGHT <= 5
-        )
+    )
 #endif // _IWEI_HEIGHT <= 5
 
 #if _IWEI_WIDTH <= 16
-        T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
+    T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
 #endif // _IWEI_WIDTH <= 16
 
 #if defined(HAS_BIAS)
-        TILE(BIA_DATA_TYPE, 1, N0, bias0);
+    TILE(BIA_DATA_TYPE, 1, N0, bias0);
 
-        // Load bias
-        T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0);
+    // Load bias
+    T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
 
-        // c = c + bias[broadcasted]
-        T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 #endif // HAS_BIAS
 
-        T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
+    T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
 
-        // Quantize the tile
-        TILE(DST_DATA_TYPE, M0, N0, cq);
-        T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
+    // Quantize the tile
+    TILE(DST_DATA_TYPE, M0, N0, cq);
+    T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
 
-        // Perform activation
-        T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
+    // Perform activation
+    T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
 
-        bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
 
-        if(x_cond)
+    if(x_cond)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
         {
-            LOOP_UNROLLING(int, m0, 0, 1, M0,
-            {
-                int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
-                VSTORE_PARTIAL(N0, PARTIAL_N0)
-                (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
-            })
-        }
-        else
+            int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+            VSTORE_PARTIAL(N0, PARTIAL_N0)
+            (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
         {
-            LOOP_UNROLLING(int, m0, 0, 1, M0,
-            {
-                int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
-                VSTORE(N0)
-                (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
-            })
-        }
+            int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+            VSTORE(N0)
+            (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+        })
     }
 }
-#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
\ No newline at end of file
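The dot-product path kept above rests on expanding sum_k (a_k + SRC_OFFSET) * (b_k + WEI_OFFSET): the dot product runs on offset-corrected weights, the two REDUCE_INTEGER8 sums reintroduce the offsets, and T_ADD_CONSTANT adds the remaining _IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (WEI_OFFSET - WEI_OFFSET_CORRECTION) term once per tile. A small C++ self-check of that identity for a single kernel row (all values arbitrary; the identity holds for any correction constant):

    #include <cassert>

    int main()
    {
        const int K = 3;                  // one kernel row (_IWEI_WIDTH)
        const int a[K] = {12, 250, 3};    // src samples
        const int b[K] = {-5, 100, -128}; // weight samples
        const int src_offset = -10, wei_offset = 7;
        const int corr = 128;             // stands in for WEI_OFFSET_CORRECTION

        // Reference: accumulate with the offsets applied directly.
        int ref = 0;
        for(int k = 0; k < K; ++k)
        {
            ref += (a[k] + src_offset) * (b[k] + wei_offset);
        }

        // Kernel formulation: dot against corrected weights, then fold the
        // offsets back in via the two reductions and the constant term.
        int dot = 0, sum_a = 0, sum_b = 0;
        for(int k = 0; k < K; ++k)
        {
            dot   += a[k] * (b[k] + corr); // DOT_PRODUCT_INTEGER8
            sum_a += a[k];                 // REDUCE_INTEGER8 on x0
            sum_b += b[k] + corr;          // REDUCE_INTEGER8 on y0
        }
        const int c = dot + sum_a * (wei_offset - corr) + sum_b * src_offset
                      + K * src_offset * (wei_offset - corr); // T_ADD_CONSTANT, per row
        assert(c == ref);
        return 0;
    }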
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 732d768308..277cba47a6 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -57,11 +57,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier > 1 && dwc_info.n0 != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_weights_to_cl_image(weights) == false), "Export to cl_image not supported!");
-    ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && (conv_info.depth_multiplier > 1));
     ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
@@ -85,6 +83,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
     const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation };
     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
 
+    if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
+    }
+
     const bool is_quantized = is_data_type_quantized(input->data_type());
 
     if(biases != nullptr)
@@ -199,7 +202,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
     _export_to_cl_image = dwc_info.export_weights_to_cl_image;
     _is_quantized       = is_data_type_quantized(input->info()->data_type());
 
-    const unsigned int n0 = adjust_vec_size(dwc_info.n0, input->info()->dimension(0));
+    const unsigned int n0 = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
     const unsigned int m0 = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
     std::string        kernel_name = "";
@@ -251,7 +254,8 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
     build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
     build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
+    build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
     build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
 
     // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll
@@ -349,13 +353,6 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
     Window slice = window_collapsed.first_slice_window_4D();
 
-    if(_depth_multiplier != 1)
-    {
-        // If the depth multiplier > 1, we need to use the input channels rather than the output channels
-        ARM_COMPUTE_ERROR_ON(slice.x().step() != 1);
-        slice.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1));
-    }
-
     cl::Image2D weights_cl_image;
 
     if(_export_to_cl_image)
--
cgit v1.2.1
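For reference, a hedged C++ sketch (made-up sizes, simplified strings) of how the compile-time options introduced in configure() fit together: N0_A collapses to 1 when the depth multiplier is > 1, and PARTIAL_N0 is now derived from the output channel count because the GWS follows the output shape.

    #include <iostream>
    #include <string>

    int main()
    {
        const unsigned int depth_multiplier = 4;
        const unsigned int dst_channels     = 24; // output tensor dimension 0
        const unsigned int n0               = 4;  // tile width, already adjusted against dst_channels

        std::string opts;
        opts += "-DN0=" + std::to_string(n0);
        // Mirrors build_opts.add_option_if_else(...) in configure().
        opts += (depth_multiplier > 1) ? " -DN0_A=1" : " -DN0_A=" + std::to_string(n0);
        opts += " -DPARTIAL_N0=" + std::to_string(dst_channels % n0);
        std::cout << opts << '\n'; // prints: -DN0=4 -DN0_A=1 -DPARTIAL_N0=0
        return 0;
    }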