author    Gian Marco Iodice <gianmarco.iodice@arm.com>    2022-08-31 11:47:08 +0100
committer Gian Marco Iodice <gianmarco.iodice@arm.com>    2022-09-07 13:09:57 +0000
commit    211a55d8218764c0a20d69d4cbdaea1906291c6b (patch)
tree      d8852d04cfefcd8868ad142449fa707adb6c1c03
parent    13a2d003fd63e03d67ad28f608082126bf04046b (diff)
download  ComputeLibrary-211a55d8218764c0a20d69d4cbdaea1906291c6b.tar.gz
Optimize depthwise convolution on OpenCL
The optimization concerns the case where the depth multiplier is > 1. The
depth-multiplier for-loop has been removed from the OpenCL kernel and the
GWS has been mapped to the output shape. In this way, we can still process
a tile with N0 columns and improve the performance of depthwise convolution
by over 80% when the depth multiplier is > 1.

Resolves COMPMID-5568

Change-Id: I604e287d4eeb31c54b9cc6c3072a698cd0e3e136
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8184
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
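The essence of the change is an index remap rather than new math: the kernel
used to iterate over the depth multiplier internally, while now each work-item
owns output channels directly and recovers the source channel with a division,
exactly as the cout / DEPTH_MULTIPLIER and cout-indexed loads in the hunks
below. A host-side C++ sketch of the before/after channel indexing (names are
illustrative, not the kernel's):

#include <cassert>

int main()
{
    const int depth_multiplier = 4;
    const int src_channels     = 8;
    const int dst_channels     = src_channels * depth_multiplier;

    // Before: the GWS covered the *input* channels and an inner loop
    // over d expanded each one into DEPTH_MULTIPLIER outputs.
    for(int cin = 0; cin < src_channels; ++cin)
    {
        for(int d = 0; d < depth_multiplier; ++d)
        {
            const int cout = cin * depth_multiplier + d; // weight/dst channel
            assert(cout / depth_multiplier == cin);      // the inverse map
        }
    }

    // After: the GWS covers the *output* channels; the loop over d is
    // gone and the source channel is derived on the fly, so a tile of
    // N0 output channels can be written per work-item.
    for(int cout = 0; cout < dst_channels; ++cout)
    {
        const int cin = cout / depth_multiplier; // source channel to load
        (void)cin;
    }
    return 0;
}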
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl                |  90
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl         | 171
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp  |  19
-rw-r--r--  src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp         |  24
-rw-r--r--  tests/validation/CL/DepthwiseConvolutionLayer.cpp                |   4
-rw-r--r--  tests/validation/CL/DepthwiseConvolutionLayerNative.cpp          | 123
6 files changed, 276 insertions, 155 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index b24a6ae85f..8b14b27643 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -25,7 +25,8 @@
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"
-
+// *INDENT-OFF*
+// clang-format off
#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
//! @cond Doxygen_Suppress
/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
@@ -51,6 +52,7 @@
* - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
* - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
* @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -101,7 +103,7 @@ __kernel void dwc_native_fp_nhwc(
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension)
-#define _IN0_A N0 // Cols tile A
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
#define _IM0_B _IWEI_WIDTH // Rows tile B
#define _IN0_B N0 // Cols tile B
#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
@@ -121,39 +123,34 @@ __kernel void dwc_native_fp_nhwc(
xi -= PAD_LEFT;
yi -= PAD_TOP;
- int d = 0;
-#if DEPTH_MULTIPLIER != 1
- for(; d < DEPTH_MULTIPLIER; d++)
-#endif // DEPTH_MULTIPLIER != 1
- {
- TILE(ACC_DATA_TYPE, M0, N0, c);
+ TILE(ACC_DATA_TYPE, M0, N0, c);
- // Reset accumulators
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- c[i].v = 0;
- })
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
#if _IWEI_HEIGHT < 5
- LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
#else // _IWEI_HEIGHT <= 5
- for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+ for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
#endif // _IWEI_HEIGHT <= 5
- {
- TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
- LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
{
a[i].v = 0;
})
// Load tile from the src tensor (TILE A)
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
// Load tile from the weights tensor (TILE B)
- T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b);
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
// Optimized path for STRIDE_X == 1
// If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
@@ -161,47 +158,48 @@ __kernel void dwc_native_fp_nhwc(
{
LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
{
- c[m0].v += a[xk + m0].v *b[xk].v;
+ c[m0].v = fma(a[xk + m0].v, b[xk].v, c[m0].v);
})
})
- }
+ }
#if _IWEI_HEIGHT < 5
)
#endif // _IWEI_HEIGHT <= 5
#if defined(HAS_BIAS)
- TILE(BIA_DATA_TYPE, 1, N0, bias0);
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
- T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0);
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
- // c = c + bias[broadcasted]
- T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
#endif // HAS_BIAS
- T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+ T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
- TILE(uint, M0, 1, dst_indirect_y);
+ TILE(uint, M0, 1, dst_indirect_y);
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
- if(x_cond)
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
- VSTORE_PARTIAL(N0, PARTIAL_N0)
- (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
- else
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
- VSTORE(N0)
- (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
}
}
-#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
\ No newline at end of file
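Why _IN0_A can shrink to a single column when DEPTH_MULTIPLIER > 1: the
validation added in CLDepthwiseConvolutionLayerNativeKernel.cpp (further down)
requires the depth multiplier to be a multiple of N0, so a tile of N0
consecutive output channels always falls inside the group produced by one
input channel, and a single source column can be broadcast across the whole
tile. A minimal C++ check of that invariant (variable names are illustrative):

#include <cassert>

// A tile of n0 consecutive output channels, starting at a multiple of
// n0, maps to exactly one input channel whenever dm % n0 == 0.
int main()
{
    const int dm = 4, n0 = 2, dst_channels = 24;
    assert(dm % n0 == 0);
    for(int cout = 0; cout < dst_channels; cout += n0)
    {
        const int first = cout / dm;            // input channel of lane 0
        const int last  = (cout + n0 - 1) / dm; // input channel of lane n0-1
        assert(first == last); // whole tile reads the same source column
    }
    return 0;
}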
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index 263a23ef28..e2ffd444dd 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -25,6 +25,8 @@
#include "helpers.h"
#include "tile_helpers.h"
+// *INDENT-OFF*
+// clang-format off
#define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE)
#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE
#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0)
@@ -35,11 +37,11 @@
#define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \
({})
-#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \
- TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \
- TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \
- T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, multipliers); \
- T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, shifts);
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \
+ TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \
+ TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \
+ T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout, 0, 0, 0, multipliers); \
+ T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout, 0, 0, 0, shifts);
#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
@@ -73,6 +75,7 @@
* - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
* - N0 = 2, 3, 4, 8, 16
* @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -133,7 +136,7 @@ __kernel void dwc_native_quantized_nhwc(
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension)
-#define _IN0_A N0 // Cols tile A
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
#define _IM0_B _IWEI_WIDTH // Rows tile B
#define _IN0_B N0 // Cols tile B
#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
@@ -153,121 +156,117 @@ __kernel void dwc_native_quantized_nhwc(
xi -= PAD_LEFT;
yi -= PAD_TOP;
- int d = 0;
-#if DEPTH_MULTIPLIER != 1
- for(; d < DEPTH_MULTIPLIER; d++)
-#endif // DEPTH_MULTIPLIER != 1
- {
- TILE(ACC_DATA_TYPE, M0, N0, c);
+ TILE(ACC_DATA_TYPE, M0, N0, c);
- // Reset accumulators
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- c[i].v = 0;
- })
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
#if _IWEI_HEIGHT <= 5
- LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
#else // _IWEI_HEIGHT <= 5
- for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
#endif // _IWEI_HEIGHT <= 5
- {
- TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
- LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
- {
- a[i].v = ZERO_VALUE;
- })
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = ZERO_VALUE;
+ })
- // Load tile from the src tensor (TILE A)
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
- TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
- // Load tile from the weights tensor (TILE B)
- T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b);
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
- // Optimized path for STRIDE_X == 1
- // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
- LOOP_UNROLLING(int, m0, 0, 1, M0,
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
{
- LOOP_UNROLLING(int, n0, 0, 1, N0,
- {
#if _IWEI_WIDTH <= 16
#define DOT_DATA_TYPE SRC_DATA_TYPE
#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE))
- // Optimized path for the dot instruction
- TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
- TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
- ACC_DATA_TYPE offset_a = 0;
- ACC_DATA_TYPE offset_b = 0;
+ // Optimized path for the dot instruction
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
+ ACC_DATA_TYPE offset_a = 0;
+ ACC_DATA_TYPE offset_b = 0;
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
- x0[0].s[xk] = a[xk + m0].s[n0];
- y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
- })
- DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
- REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
- REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
- c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ x0[0].s[xk] = a[xk + m0].s[n0];
+ y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
+ })
+ DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
+ c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
#else // _IWEI_WIDTH <= 16
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
- c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
- })
-#endif // _IWEI_WIDTH <= 16
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
})
+#endif // _IWEI_WIDTH <= 16
})
- }
+ })
+ }
#if _IWEI_HEIGHT <= 5
- )
+ )
#endif // _IWEI_HEIGHT <= 5
#if _IWEI_WIDTH <= 16
- T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
#endif // _IWEI_WIDTH <= 16
#if defined(HAS_BIAS)
- TILE(BIA_DATA_TYPE, 1, N0, bias0);
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
- // Load bias
- T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0);
+ // Load bias
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
- // c = c + bias[broadcasted]
- T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
#endif // HAS_BIAS
- T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
+ T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
- // Quantize the tile
- TILE(DST_DATA_TYPE, M0, N0, cq);
- T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
+ // Quantize the tile
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
- // Perform activation
- T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
+ // Perform activation
+ T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
- if(x_cond)
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
- VSTORE_PARTIAL(N0, PARTIAL_N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
- else
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
- VSTORE(N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
}
}
-#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
\ No newline at end of file
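As context for the quantized hunks above: the scalar fallback path (taken when
_IWEI_WIDTH > 16) applies the zero-point offsets per element inside the
multiply-accumulate, which the dot-product path reproduces through the
REDUCE_INTEGER8 offset corrections. A stand-alone C++ rendering of the
fallback accumulation (function name is illustrative):

#include <cstdint>

// Offset-corrected integer accumulation, as in the kernel's scalar
// fallback: each src/weight value is shifted by its zero point before
// the multiply-accumulate.
int32_t dwc_accumulate(const int8_t *a, const int8_t *w, int len,
                       int32_t src_offset, int32_t wei_offset)
{
    int32_t acc = 0;
    for(int k = 0; k < len; ++k)
    {
        acc += ((int32_t)a[k] + src_offset) * ((int32_t)w[k] + wei_offset);
    }
    return acc;
}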
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 732d768308..277cba47a6 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -57,11 +57,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier > 1 && dwc_info.n0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_weights_to_cl_image(weights) == false), "Export to cl_image not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && (conv_info.depth_multiplier > 1));
ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
@@ -85,6 +83,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation };
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+ if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
+ }
+
const bool is_quantized = is_data_type_quantized(input->data_type());
if(biases != nullptr)
@@ -199,7 +202,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
_export_to_cl_image = dwc_info.export_weights_to_cl_image;
_is_quantized = is_data_type_quantized(input->info()->data_type());
- const unsigned int n0 = adjust_vec_size(dwc_info.n0, input->info()->dimension(0));
+ const unsigned int n0 = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
const unsigned int m0 = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
std::string kernel_name = "";
@@ -251,7 +254,8 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
+ build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
// Force unroll with pragma when any of the following values exceed the maximum number of manual unroll
@@ -349,13 +353,6 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
Window slice = window_collapsed.first_slice_window_4D();
- if(_depth_multiplier != 1)
- {
- // If the depth multiplier > 1, we need to use the input channels rather than the output channels
- ARM_COMPUTE_ERROR_ON(slice.x().step() != 1);
- slice.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1));
- }
-
cl::Image2D weights_cl_image;
if(_export_to_cl_image)
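Taken together, the configure-side changes above boil down to two new
compile-time decisions; a hedged free-standing rendering of them (helper
names are hypothetical, not the actual member code):

#include <string>

// N0_A: tile-A width. One column when the depth multiplier is > 1,
// since every lane of a tile then reads the same source channel;
// N0 otherwise.
std::string n0_a_option(unsigned depth_multiplier, unsigned n0)
{
    return (depth_multiplier > 1) ? "-DN0_A=1"
                                  : "-DN0_A=" + std::to_string(n0);
}

// PARTIAL_N0: leftover lanes, now computed on the *output* channel
// count because the GWS is mapped to the output shape.
std::string partial_n0_option(unsigned dst_channels, unsigned n0)
{
    return "-DPARTIAL_N0=" + std::to_string(dst_channels % n0);
}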
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index e821726d0e..8546471fdd 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -54,19 +54,22 @@ bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned i
const size_t kernel_w = weights->tensor_shape()[idx_w];
const size_t kernel_h = weights->tensor_shape()[idx_h];
- if((kernel_w == 1) && (kernel_h == 1))
+ if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
{
return false;
}
- if(depth_multiplier > 1)
+ if((kernel_w == 1) && (kernel_h == 1))
{
return false;
}
- if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+ if(depth_multiplier > 1)
{
- return false;
+ if((depth_multiplier % 4) != 0)
+ {
+ return false;
+ }
}
return true;
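The reordered heuristic above is equivalent to the following sketch; the
depth-multiplier gate is the behavioral change (multiples of 4 may now use
cl_image). Names are illustrative, and unsupported_target stands for the
G71/Midgard check:

// Sketch of export_weights_to_cl_image_heuristic after this patch.
bool can_export_weights_to_cl_image(bool unsupported_target,
                                    size_t kernel_w, size_t kernel_h,
                                    unsigned depth_multiplier)
{
    if(unsupported_target)             return false; // G71 / Midgard
    if(kernel_w == 1 && kernel_h == 1) return false;
    if(depth_multiplier > 1 && (depth_multiplier % 4) != 0)
    {
        return false;
    }
    return true;
}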
@@ -110,7 +113,18 @@ void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info,
}
else
{
- dwc_compute_info.n0 = 1;
+ if((depth_multiplier % 4) == 0)
+ {
+ dwc_compute_info.n0 = 4;
+ }
+ else if((depth_multiplier % 2) == 0)
+ {
+ dwc_compute_info.n0 = 2;
+ }
+ else
+ {
+ dwc_compute_info.n0 = 1;
+ }
}
dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0));
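And the new n0 selection for the depth-multiplier path, as a stand-alone
function (a sketch of the hunk above, before adjust_vec_size is applied):

// Widest vector width that divides the depth multiplier, so an N0-wide
// tile of output channels never straddles two input channels.
unsigned n0_for_depth_multiplier(unsigned depth_multiplier)
{
    if((depth_multiplier % 4) == 0) return 4;
    if((depth_multiplier % 2) == 0) return 2;
    return 1;
}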
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index b2cff2b792..5d5562f678 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,7 +48,7 @@ constexpr RelativeTolerance<float> tolerance_f32(0.01f); /**<
constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
constexpr float tolerance_num = 0.05f; /**< Tolerance number */
-const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 5 });
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 4 });
const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 8 });
//Activation Functions
diff --git a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
index f565255719..45047cab44 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -242,6 +242,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<fl
framework::ARM_COMPUTE_PRINT_INFO();
}
}
+
TEST_SUITE_END() // ExportWeightsToCLImage
TEST_SUITE_END() // FP32
@@ -287,6 +288,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<ha
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
}
+
TEST_SUITE(ExportWeightsToCLImage)
FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
@@ -347,6 +349,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<ha
framework::ARM_COMPUTE_PRINT_INFO();
}
}
+
TEST_SUITE_END() // ExportWeightsToCLImage
TEST_SUITE_END() // FP16
TEST_SUITE_END() // Float
@@ -355,7 +358,7 @@ TEST_SUITE(Float)
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- width_values_precommit,
+ framework::dataset::make("width", { 33U } ),
height_values_precommit),
channel_values_precommit),
batch_values_precommit),
@@ -376,7 +379,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<fl
FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- width_values_nightly,
+ framework::dataset::make("width", { 53U } ),
height_values_nightly),
channel_values_nightly),
batch_values_nightly),
@@ -394,12 +397,67 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<fl
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
}
+
+TEST_SUITE(DepthMultiplierMultipleOfOutputChannels)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ framework::dataset::make("width", { 33U } ),
+ height_values_precommit),
+ channel_values_precommit),
+ batch_values_precommit),
+ kernel_sz_values_precommit),
+ framework::dataset::make("depth_multiplier", 2)),
+ dilation_values),
+ stride_values),
+ padding_valid_values),
+ framework::dataset::make("DataType", DataType::F32)),
+ data_layout_values),
+ act_values),
+ framework::dataset::make("N0", {2})),
+ framework::dataset::make("ExportToCLImage", false)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+TEST_SUITE(ExportWeightsToCLImage)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ framework::dataset::make("width", { 33U } ),
+ height_values_precommit),
+ channel_values_precommit),
+ batch_values_precommit),
+ kernel_sz_values_precommit),
+ framework::dataset::make("depth_multiplier", 4)),
+ dilation_values),
+ stride_values),
+ padding_valid_values),
+ framework::dataset::make("DataType", DataType::F32)),
+ data_layout_values),
+ act_values),
+ framework::dataset::make("N0", {4})),
+ framework::dataset::make("ExportToCLImage", true)))
+{
+ // Validate output
+ if(_validate_output)
+ {
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+ }
+ else
+ {
+ ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+ framework::ARM_COMPUTE_PRINT_INFO();
+ }
+}
+TEST_SUITE_END() // ExportWeightsToCLImage
+TEST_SUITE_END() // DepthMultiplierMultipleOfOutputChannels
TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- width_values_precommit,
+ framework::dataset::make("width", { 33U } ),
height_values_precommit),
channel_values_precommit),
batch_values_precommit),
@@ -420,7 +478,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<ha
FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::NIGHTLY,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- width_values_nightly,
+ framework::dataset::make("width", { 53U } ),
height_values_nightly),
channel_values_nightly),
batch_values_nightly),
@@ -438,6 +496,61 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<ha
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
}
+
+TEST_SUITE(DepthMultiplierMultipleOfOutputChannels)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ framework::dataset::make("width", { 33U } ),
+ height_values_precommit),
+ channel_values_precommit),
+ batch_values_precommit),
+ kernel_sz_values_precommit),
+ framework::dataset::make("depth_multiplier", 2)),
+ dilation_values),
+ stride_values),
+ padding_valid_values),
+ framework::dataset::make("DataType", DataType::F16)),
+ data_layout_values),
+ act_values),
+ framework::dataset::make("N0", {2})),
+ framework::dataset::make("ExportToCLImage", false)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE(ExportWeightsToCLImage)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ framework::dataset::make("width", { 33U } ),
+ height_values_precommit),
+ channel_values_precommit),
+ batch_values_precommit),
+ kernel_sz_values_precommit),
+ framework::dataset::make("depth_multiplier", 4)),
+ dilation_values),
+ stride_values),
+ padding_valid_values),
+ framework::dataset::make("DataType", DataType::F16)),
+ data_layout_values),
+ act_values),
+ framework::dataset::make("N0", {4})),
+ framework::dataset::make("ExportToCLImage", true)))
+{
+ // Validate output
+ if(_validate_output)
+ {
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+ }
+ else
+ {
+ ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+ framework::ARM_COMPUTE_PRINT_INFO();
+ }
+}
+TEST_SUITE_END() // ExportWeightsToCLImage
+TEST_SUITE_END() // DepthMultiplierMultipleOfOutputChannels
TEST_SUITE_END() // FP16
TEST_SUITE_END() // Float
TEST_SUITE_END() // DepthMultiplier