author    Gian Marco Iodice <gianmarco.iodice@arm.com>    2022-08-31 11:47:08 +0100
committer Gian Marco Iodice <gianmarco.iodice@arm.com>    2022-09-07 13:09:57 +0000
commit    211a55d8218764c0a20d69d4cbdaea1906291c6b (patch)
tree      d8852d04cfefcd8868ad142449fa707adb6c1c03
parent    13a2d003fd63e03d67ad28f608082126bf04046b (diff)
download  ComputeLibrary-211a55d8218764c0a20d69d4cbdaea1906291c6b.tar.gz
Optimize depthwise convolution on OpenCL
The optimization concerns the case where the depth multiplier is > 1. The
depth-multiplier for-loop has been removed from the OpenCL kernel and the
GWS has been mapped to the output shape. In this way, we can still process
a tile with N0 columns and improve the performance of depthwise convolution
by over 80% when the depth multiplier is > 1.

Resolves COMPMID-5568

Change-Id: I604e287d4eeb31c54b9cc6c3072a698cd0e3e136
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8184
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
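The essence of the change is an index remap rather than new math: the kernel
used to iterate over the depth multiplier internally, while now each work-item
owns output channels directly and recovers the source channel with a division,
exactly as the cout / DEPTH_MULTIPLIER and cout-indexed loads in the hunks
below. A host-side C++ sketch of the before/after channel indexing (names are
illustrative, not the kernel's):

#include <cassert>

int main()
{
    const int depth_multiplier = 4;
    const int src_channels     = 8;
    const int dst_channels     = src_channels * depth_multiplier;

    // Before: the GWS covered the *input* channels and an inner loop
    // over d expanded each one into DEPTH_MULTIPLIER outputs.
    for(int cin = 0; cin < src_channels; ++cin)
    {
        for(int d = 0; d < depth_multiplier; ++d)
        {
            const int cout = cin * depth_multiplier + d; // weight/dst channel
            assert(cout / depth_multiplier == cin);      // the inverse map
        }
    }

    // After: the GWS covers the *output* channels; the loop over d is
    // gone and the source channel is derived on the fly, so a tile of
    // N0 output channels can be written per work-item.
    for(int cout = 0; cout < dst_channels; ++cout)
    {
        const int cin = cout / depth_multiplier; // source channel to load
        (void)cin;
    }
    return 0;
}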
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl                |  90
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl         | 171
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp  |  19
-rw-r--r--  src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp         |  24
-rw-r--r--  tests/validation/CL/DepthwiseConvolutionLayer.cpp                |   4
-rw-r--r--  tests/validation/CL/DepthwiseConvolutionLayerNative.cpp          | 123
6 files changed, 276 insertions, 155 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index b24a6ae85f..8b14b27643 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -25,7 +25,8 @@
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"
-
+// *INDENT-OFF*
+// clang-format off
#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
//! @cond Doxygen_Suppress
/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
@@ -51,6 +52,7 @@
* - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
* - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
* @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -101,7 +103,7 @@ __kernel void dwc_native_fp_nhwc(
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension)
-#define _IN0_A N0 // Cols tile A
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
#define _IM0_B _IWEI_WIDTH // Rows tile B
#define _IN0_B N0 // Cols tile B
#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
@@ -121,39 +123,34 @@ __kernel void dwc_native_fp_nhwc(
xi -= PAD_LEFT;
yi -= PAD_TOP;
- int d = 0;
-#if DEPTH_MULTIPLIER != 1
- for(; d < DEPTH_MULTIPLIER; d++)
-#endif // DEPTH_MULTIPLIER != 1
- {
- TILE(ACC_DATA_TYPE, M0, N0, c);
+ TILE(ACC_DATA_TYPE, M0, N0, c);
- // Reset accumulators
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- c[i].v = 0;
- })
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
#if _IWEI_HEIGHT < 5
- LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
#else // _IWEI_HEIGHT <= 5
- for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+ for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
#endif // _IWEI_HEIGHT <= 5
- {
- TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
- LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
{
a[i].v = 0;
})
// Load tile from the src tensor (TILE A)
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
// Load tile from the weights tensor (TILE B)
- T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b);
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
// Optimized path for STRIDE_X == 1
// If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
@@ -161,47 +158,48 @@ __kernel void dwc_native_fp_nhwc(
{
LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
{
- c[m0].v += a[xk + m0].v *b[xk].v;
+ c[m0].v = fma(a[xk + m0].v, b[xk].v, c[m0].v);
})
})
- }
+ }
#if _IWEI_HEIGHT < 5
)
#endif // _IWEI_HEIGHT <= 5
#if defined(HAS_BIAS)
- TILE(BIA_DATA_TYPE, 1, N0, bias0);
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
- T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0);
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
- // c = c + bias[broadcasted]
- T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
#endif // HAS_BIAS
- T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+ T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
- TILE(uint, M0, 1, dst_indirect_y);
+ TILE(uint, M0, 1, dst_indirect_y);
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
- if(x_cond)
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
- VSTORE_PARTIAL(N0, PARTIAL_N0)
- (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
- else
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
- VSTORE(N0)
- (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
}
}
-#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
\ No newline at end of file
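Why _IN0_A can shrink to a single column when DEPTH_MULTIPLIER > 1: the
validation added in CLDepthwiseConvolutionLayerNativeKernel.cpp (further down)
requires the depth multiplier to be a multiple of N0, so a tile of N0
consecutive output channels always falls inside the group produced by one
input channel, and a single source column can be broadcast across the whole
tile. A minimal C++ check of that invariant (variable names are illustrative):

#include <cassert>

// A tile of n0 consecutive output channels, starting at a multiple of
// n0, maps to exactly one input channel whenever dm % n0 == 0.
int main()
{
    const int dm = 4, n0 = 2, dst_channels = 24;
    assert(dm % n0 == 0);
    for(int cout = 0; cout < dst_channels; cout += n0)
    {
        const int first = cout / dm;            // input channel of lane 0
        const int last  = (cout + n0 - 1) / dm; // input channel of lane n0-1
        assert(first == last); // whole tile reads the same source column
    }
    return 0;
}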
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index 263a23ef28..e2ffd444dd 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -25,6 +25,8 @@
#include "helpers.h"
#include "tile_helpers.h"
+// *INDENT-OFF*
+// clang-format off
#define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE)
#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE
#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0)
@@ -35,11 +37,11 @@
#define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \
({})
-#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \
- TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \
- TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \
- T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, multipliers); \
- T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, shifts);
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \
+ TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \
+ TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \
+ T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout, 0, 0, 0, multipliers); \
+ T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout, 0, 0, 0, shifts);
#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
@@ -73,6 +75,7 @@
* - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
* - N0 = 2, 3, 4, 8, 16
* @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -133,7 +136,7 @@ __kernel void dwc_native_quantized_nhwc(
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension)
-#define _IN0_A N0 // Cols tile A
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
#define _IM0_B _IWEI_WIDTH // Rows tile B
#define _IN0_B N0 // Cols tile B
#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
@@ -153,121 +156,117 @@ __kernel void dwc_native_quantized_nhwc(
xi -= PAD_LEFT;
yi -= PAD_TOP;
- int d = 0;
-#if DEPTH_MULTIPLIER != 1
- for(; d < DEPTH_MULTIPLIER; d++)
-#endif // DEPTH_MULTIPLIER != 1
- {
- TILE(ACC_DATA_TYPE, M0, N0, c);
+ TILE(ACC_DATA_TYPE, M0, N0, c);
- // Reset accumulators
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- c[i].v = 0;
- })
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
#if _IWEI_HEIGHT <= 5
- LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
#else // _IWEI_HEIGHT <= 5
- for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
#endif // _IWEI_HEIGHT <= 5
- {
- TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
- LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
- {
- a[i].v = ZERO_VALUE;
- })
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = ZERO_VALUE;
+ })
- // Load tile from the src tensor (TILE A)
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
- TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
- // Load tile from the weights tensor (TILE B)
- T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b);
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
- // Optimized path for STRIDE_X == 1
- // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
- LOOP_UNROLLING(int, m0, 0, 1, M0,
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
{
- LOOP_UNROLLING(int, n0, 0, 1, N0,
- {
#if _IWEI_WIDTH <= 16
#define DOT_DATA_TYPE SRC_DATA_TYPE
#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE))
- // Optimized path for the dot instruction
- TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
- TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
- ACC_DATA_TYPE offset_a = 0;
- ACC_DATA_TYPE offset_b = 0;
+ // Optimized path for the dot instruction
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
+ ACC_DATA_TYPE offset_a = 0;
+ ACC_DATA_TYPE offset_b = 0;
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
- x0[0].s[xk] = a[xk + m0].s[n0];
- y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
- })
- DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
- REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
- REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
- c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ x0[0].s[xk] = a[xk + m0].s[n0];
+ y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
+ })
+ DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
+ c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
#else // _IWEI_WIDTH <= 16
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
- c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
- })
-#endif // _IWEI_WIDTH <= 16
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
})
+#endif // _IWEI_WIDTH <= 16
})
- }
+ })
+ }
#if _IWEI_HEIGHT <= 5
- )
+ )
#endif // _IWEI_HEIGHT <= 5
#if _IWEI_WIDTH <= 16
- T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
#endif // _IWEI_WIDTH <= 16
#if defined(HAS_BIAS)
- TILE(BIA_DATA_TYPE, 1, N0, bias0);
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
- // Load bias
- T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0);
+ // Load bias
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
- // c = c + bias[broadcasted]
- T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
#endif // HAS_BIAS
- T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
+ T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
- // Quantize the tile
- TILE(DST_DATA_TYPE, M0, N0, cq);
- T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
+ // Quantize the tile
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
- // Perform activation
- T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
+ // Perform activation
+ T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
- if(x_cond)
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
- VSTORE_PARTIAL(N0, PARTIAL_N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
- else
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
- VSTORE(N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
}
}
-#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
\ No newline at end of file
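As context for the quantized hunks above: the scalar fallback path (taken when
_IWEI_WIDTH > 16) applies the zero-point offsets per element inside the
multiply-accumulate, which the dot-product path reproduces through the
REDUCE_INTEGER8 offset corrections. A stand-alone C++ rendering of the
fallback accumulation (function name is illustrative):

#include <cstdint>

// Offset-corrected integer accumulation, as in the kernel's scalar
// fallback: each src/weight value is shifted by its zero point before
// the multiply-accumulate.
int32_t dwc_accumulate(const int8_t *a, const int8_t *w, int len,
                       int32_t src_offset, int32_t wei_offset)
{
    int32_t acc = 0;
    for(int k = 0; k < len; ++k)
    {
        acc += ((int32_t)a[k] + src_offset) * ((int32_t)w[k] + wei_offset);
    }
    return acc;
}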
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 732d768308..277cba47a6 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -57,11 +57,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier > 1 && dwc_info.n0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_weights_to_cl_image(weights) == false), "Export to cl_image not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && (conv_info.depth_multiplier > 1));
ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
@@ -85,6 +83,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation };
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+ if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
+ }
+
const bool is_quantized = is_data_type_quantized(input->data_type());
if(biases != nullptr)
@@ -199,7 +202,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
_export_to_cl_image = dwc_info.export_weights_to_cl_image;
_is_quantized = is_data_type_quantized(input->info()->data_type());
- const unsigned int n0 = adjust_vec_size(dwc_info.n0, input->info()->dimension(0));
+ const unsigned int n0 = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
const unsigned int m0 = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
std::string kernel_name = "";
@@ -251,7 +254,8 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
+ build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
// Force unroll with pragma when any of the following values exceed the maximum number of manual unroll
@@ -349,13 +353,6 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
Window slice = window_collapsed.first_slice_window_4D();
- if(_depth_multiplier != 1)
- {
- // If the depth multiplier > 1, we need to use the input channels rather than the output channels
- ARM_COMPUTE_ERROR_ON(slice.x().step() != 1);
- slice.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1));
- }
-
cl::Image2D weights_cl_image;
if(_export_to_cl_image)
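Taken together, the configure-side changes above boil down to two new
compile-time decisions; a hedged free-standing rendering of them (helper
names are hypothetical, not the actual member code):

#include <string>

// N0_A: tile-A width. One column when the depth multiplier is > 1,
// since every lane of a tile then reads the same source channel;
// N0 otherwise.
std::string n0_a_option(unsigned depth_multiplier, unsigned n0)
{
    return (depth_multiplier > 1) ? "-DN0_A=1"
                                  : "-DN0_A=" + std::to_string(n0);
}

// PARTIAL_N0: leftover lanes, now computed on the *output* channel
// count because the GWS is mapped to the output shape.
std::string partial_n0_option(unsigned dst_channels, unsigned n0)
{
    return "-DPARTIAL_N0=" + std::to_string(dst_channels % n0);
}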
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index e821726d0e..8546471fdd 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -54,19 +54,22 @@ bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned i
const size_t kernel_w = weights->tensor_shape()[idx_w];
const size_t kernel_h = weights->tensor_shape()[idx_h];
- if((kernel_w == 1) && (kernel_h == 1))
+ if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
{
return false;
}
- if(depth_multiplier > 1)
+ if((kernel_w == 1) && (kernel_h == 1))
{
return false;
}
- if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+ if(depth_multiplier > 1)
{
- return false;
+ if((depth_multiplier % 4) != 0)
+ {
+ return false;
+ }
}
return true;
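The reordered heuristic above is equivalent to the following sketch; the
depth-multiplier gate is the behavioral change (multiples of 4 may now use
cl_image). Names are illustrative, and unsupported_target stands for the
G71/Midgard check:

// Sketch of export_weights_to_cl_image_heuristic after this patch.
bool can_export_weights_to_cl_image(bool unsupported_target,
                                    size_t kernel_w, size_t kernel_h,
                                    unsigned depth_multiplier)
{
    if(unsupported_target)             return false; // G71 / Midgard
    if(kernel_w == 1 && kernel_h == 1) return false;
    if(depth_multiplier > 1 && (depth_multiplier % 4) != 0)
    {
        return false;
    }
    return true;
}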
@@ -110,7 +113,18 @@ void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info,
}
else
{
- dwc_compute_info.n0 = 1;
+ if((depth_multiplier % 4) == 0)
+ {
+ dwc_compute_info.n0 = 4;
+ }
+ else if((depth_multiplier % 2) == 0)
+ {
+ dwc_compute_info.n0 = 2;
+ }
+ else
+ {
+ dwc_compute_info.n0 = 1;
+ }
}
dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0));
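And the new n0 selection for the depth-multiplier path, as a stand-alone
function (a sketch of the hunk above, before adjust_vec_size is applied):

// Widest vector width that divides the depth multiplier, so an N0-wide
// tile of output channels never straddles two input channels.
unsigned n0_for_depth_multiplier(unsigned depth_multiplier)
{
    if((depth_multiplier % 4) == 0) return 4;
    if((depth_multiplier % 2) == 0) return 2;
    return 1;
}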
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index b2cff2b792..5d5562f678 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,7 +48,7 @@ constexpr RelativeTolerance<float> tolerance_f32(0.01f); /**<
constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
constexpr float tolerance_num = 0.05f; /**< Tolerance number */
-const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 5 });
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 4 });
const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 8 });
//Activation Functions
diff --git a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
index f565255719..45047cab44 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -242,6 +242,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<fl
framework::ARM_COMPUTE_PRINT_INFO();
}
}
+
TEST_SUITE_END() // ExportWeightsToCLImage
TEST_SUITE_END() // FP32
@@ -287,6 +288,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<ha
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
}
+
TEST_SUITE(ExportWeightsToCLImage)
FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
@@ -347,6 +349,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<ha
framework::ARM_COMPUTE_PRINT_INFO();
}
}
+
TEST_SUITE_END() // ExportWeightsToCLImage
TEST_SUITE_END() // FP16
TEST_SUITE_END() // Float
@@ -355,7 +358,7 @@ TEST_SUITE(Float)
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- width_values_precommit,
+ framework::dataset::make("width", { 33U } ),
height_values_precommit),
channel_values_precommit),
batch_values_precommit),
@@ -376,7 +379,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<fl
FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- width_values_nightly,
+ framework::dataset::make("width", { 53U } ),
height_values_nightly),
channel_values_nightly),
batch_values_nightly),
@@ -394,12 +397,67 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<fl
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
}
+
+TEST_SUITE(DepthMultiplierMultipleOfOutputChannels)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ framework::dataset::make("width", { 33U } ),
+ height_values_precommit),
+ channel_values_precommit),
+ batch_values_precommit),
+ kernel_sz_values_precommit),
+ framework::dataset::make("depth_multiplier", 2)),
+ dilation_values),
+ stride_values),
+ padding_valid_values),
+ framework::dataset::make("DataType", DataType::F32)),
+ data_layout_values),
+ act_values),
+ framework::dataset::make("N0", {2})),
+ framework::dataset::make("ExportToCLImage", false)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+TEST_SUITE(ExportWeightsToCLImage)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ framework::dataset::make("width", { 33U } ),
+ height_values_precommit),
+ channel_values_precommit),
+ batch_values_precommit),
+ kernel_sz_values_precommit),
+ framework::dataset::make("depth_multiplier", 4)),
+ dilation_values),
+ stride_values),
+ padding_valid_values),
+ framework::dataset::make("DataType", DataType::F32)),
+ data_layout_values),
+ act_values),
+ framework::dataset::make("N0", {4})),
+ framework::dataset::make("ExportToCLImage", true)))
+{
+ // Validate output
+ if(_validate_output)
+ {
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+ }
+ else
+ {
+ ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+ framework::ARM_COMPUTE_PRINT_INFO();
+ }
+}
+TEST_SUITE_END() // ExportWeightsToCLImage
+TEST_SUITE_END() // DepthMultiplierMultipleOfOutputChannels
TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- width_values_precommit,
+ framework::dataset::make("width", { 33U } ),
height_values_precommit),
channel_values_precommit),
batch_values_precommit),
@@ -420,7 +478,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<ha
FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::NIGHTLY,
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- width_values_nightly,
+ framework::dataset::make("width", { 53U } ),
height_values_nightly),
channel_values_nightly),
batch_values_nightly),
@@ -438,6 +496,61 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<ha
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
}
+
+TEST_SUITE(DepthMultiplierMultipleOfOutputChannels)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ framework::dataset::make("width", { 33U } ),
+ height_values_precommit),
+ channel_values_precommit),
+ batch_values_precommit),
+ kernel_sz_values_precommit),
+ framework::dataset::make("depth_multiplier", 2)),
+ dilation_values),
+ stride_values),
+ padding_valid_values),
+ framework::dataset::make("DataType", DataType::F16)),
+ data_layout_values),
+ act_values),
+ framework::dataset::make("N0", {2})),
+ framework::dataset::make("ExportToCLImage", false)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE(ExportWeightsToCLImage)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ framework::dataset::make("width", { 33U } ),
+ height_values_precommit),
+ channel_values_precommit),
+ batch_values_precommit),
+ kernel_sz_values_precommit),
+ framework::dataset::make("depth_multiplier", 4)),
+ dilation_values),
+ stride_values),
+ padding_valid_values),
+ framework::dataset::make("DataType", DataType::F16)),
+ data_layout_values),
+ act_values),
+ framework::dataset::make("N0", {4})),
+ framework::dataset::make("ExportToCLImage", true)))
+{
+ // Validate output
+ if(_validate_output)
+ {
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+ }
+ else
+ {
+ ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+ framework::ARM_COMPUTE_PRINT_INFO();
+ }
+}
+TEST_SUITE_END() // ExportWeightsToCLImage
+TEST_SUITE_END() // DepthMultiplierMultipleOfOutputChannels
TEST_SUITE_END() // FP16
TEST_SUITE_END() // Float
TEST_SUITE_END() // DepthMultiplier