From 9d3bd41030366326e9c8afe5db3a5812a76b135b Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 30 Dec 2022 09:45:00 +0000 Subject: Move DWC native heuristic into the heuristic folder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move the DWC native heuristic from CLDepthwiseConvolutionLayer to heuristic/ - Update the heuristic for Arm® Mali™-G77. Use a smaller block size (4x2) for Fp16 - Call the new heuristic in GpuDepthwiseConv2d Resolves COMPMID-5798 Signed-off-by: Gian Marco Iodice Change-Id: I6bfd30cea76bea2e98202a7a5c1d51709f3382a4 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8889 Comments-Addressed: Arm Jenkins Reviewed-by: Gunes Bayir Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- Android.bp | 3 + arm_compute/core/KernelDescriptors.h | 4 +- filelist.json | 3 + .../sketch/gpu/operators/GpuDepthwiseConv2d.cpp | 129 ++------- .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 151 +--------- .../dwc_native/ClDWCNativeDefaultConfigBifrost.cpp | 279 +++++++++++++++++++ .../dwc_native/ClDWCNativeDefaultConfigBifrost.h | 61 ++++ .../dwc_native/ClDWCNativeDefaultConfigValhall.cpp | 306 +++++++++++++++++++++ .../dwc_native/ClDWCNativeDefaultConfigValhall.h | 59 ++++ .../dwc_native/ClDWCNativeHeuristicsHelpers.cpp | 61 ++++ .../dwc_native/ClDWCNativeHeuristicsHelpers.h | 45 +++ .../dwc_native/ClDWCNativeKernelConfig.h | 65 +++++ .../dwc_native/IClDWCNativeKernelConfig.h | 118 ++++++++ .../ClIndirectConvDefaultConfigValhall.cpp | 2 +- .../ClIndirectConvDefaultConfigValhall.h | 2 +- .../indirect_conv/ClIndirectConvKernelConfig.h | 2 +- .../indirect_conv/IClIndirectConvKernelConfig.h | 6 +- 17 files changed, 1036 insertions(+), 260 deletions(-) create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h create mode 100644 src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h diff --git a/Android.bp b/Android.bp index bf6ee147f6..ec7bccd819 100644 --- a/Android.bp +++ b/Android.bp @@ -957,6 +957,9 @@ cc_library_static { "src/runtime/Utils.cpp", "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp", "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp", "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp", "utils/CommonGraphOptions.cpp", "utils/GraphUtils.cpp", diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index cacbef25ea..4a64032b14 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -104,8 +104,8 @@ struct GEMMKernelInfo /** Compute descriptor used by the depthwise convolution native kernel */ struct DWCComputeKernelInfo { - unsigned int n0{ 0 }; /**< Number of columns processed by each thread */ - unsigned int 
m0{ 0 }; /**< Number of rows processed by each thread */ + unsigned int n0{ 1 }; /**< Number of columns processed by each thread */ + unsigned int m0{ 1 }; /**< Number of rows processed by each thread */ bool export_input_to_cl_image{ false }; /**< Export input to cl_image */ bool export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */ }; diff --git a/filelist.json b/filelist.json index 5828b43ec1..7bc47f7a5c 100644 --- a/filelist.json +++ b/filelist.json @@ -502,6 +502,9 @@ "src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp", "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp", "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp", "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp" ] } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp index caccbb1830..b08af61d8f 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -31,6 +31,8 @@ #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" namespace arm_compute { @@ -40,115 +42,6 @@ namespace dynamic_fusion { namespace { -bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target) -{ - if(!export_to_cl_image(weights)) - { - return false; - } - - const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - const size_t kernel_w = weights->tensor_shape()[idx_w]; - const size_t kernel_h = weights->tensor_shape()[idx_h]; - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - if((kernel_w == 1) && (kernel_h == 1)) - { - return false; - } - - if(depth_multiplier > 1) - { - if((depth_multiplier % 4) != 0) - { - return false; - } - } - - return true; -} - -void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights, - const DepthwiseConv2dAttributes &attributes, const GPUTarget gpu_target) -{ - const unsigned int depth_multiplier = attributes.depth_multiplier(); - - // Floating point path - // First check if we can export to cl_image. 
- dwc_compute_info.export_input_to_cl_image = false; - dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target); - - // Set n0 - if(depth_multiplier == 1) - { - if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16) - { - dwc_compute_info.n0 = 8; - } - else - { - dwc_compute_info.n0 = 4; - } - } - else - { - if((depth_multiplier % 4) == 0) - { - dwc_compute_info.n0 = 4; - } - else if((depth_multiplier % 2) == 0) - { - dwc_compute_info.n0 = 2; - } - else - { - dwc_compute_info.n0 = 1; - } - } - - dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0)); - - // Set m0 only if stride_x == 1 and dilation_x == 1 - if(attributes.stride().x() == 1 && attributes.dilation().x() == 1) - { - const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const size_t kernel_w = weights->tensor_shape()[idx_w]; - - if((kernel_w >= 9) || (kernel_w == 1)) - { - dwc_compute_info.m0 = 1; - } - else - { - if(weights->data_type() == DataType::F16) - { - if((input->dimension(1) % 5) == 0) - { - dwc_compute_info.m0 = 5; - } - else - { - dwc_compute_info.m0 = 4; - } - } - else - { - dwc_compute_info.m0 = 2; - } - } - } - else - { - dwc_compute_info.m0 = 1; - } - return; -} - void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes) { if(dst->total_size() == 0U) @@ -202,8 +95,13 @@ Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &cont const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); auto settings = ClComponentDepthwiseConv2d::Settings(); - DWCComputeKernelInfo dwc_info; - initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + + // Get the depthwise convolution compute parameters + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.fast_relaxed_math( (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) @@ -294,8 +192,13 @@ void GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch, const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); auto settings = ClComponentDepthwiseConv2d::Settings(); - DWCComputeKernelInfo dwc_info; - initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + + // Get the depthwise convolution compute parameters + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) .m0(dwc_info.m0) diff --git 
a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index 3eadaee0de..3909c15352 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -23,15 +23,15 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" -#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" #include "src/common/utils/Log.h" @@ -39,137 +39,7 @@ namespace arm_compute { using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; - -namespace -{ -bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target) -{ - if(!export_to_cl_image(weights)) - { - return false; - } - - const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - const size_t kernel_w = weights->tensor_shape()[idx_w]; - const size_t kernel_h = weights->tensor_shape()[idx_h]; - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - if((kernel_w == 1) && (kernel_h == 1)) - { - return false; - } - - if(depth_multiplier > 1) - { - if((depth_multiplier % 4) != 0) - { - return false; - } - } - - return true; -} - -void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier, - GPUTarget gpu_target) -{ - ARM_COMPUTE_UNUSED(input); - - if(!is_data_type_float(weights->data_type())) - { - dwc_compute_info.export_weights_to_cl_image = false; - dwc_compute_info.n0 = (depth_multiplier == 1) ? 4 : 1; - if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) - { - dwc_compute_info.m0 = 2; - } - else - { - dwc_compute_info.m0 = 1; - } - - return; - } - - // Floating point path - - // First check if we can export to cl_image. 
-    dwc_compute_info.export_input_to_cl_image   = false;
-    dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target);
-
-    // Set n0
-    if(depth_multiplier == 1)
-    {
-        if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16)
-        {
-            dwc_compute_info.n0 = 8;
-        }
-        else
-        {
-            dwc_compute_info.n0 = 4;
-        }
-    }
-    else
-    {
-        if((depth_multiplier % 4) == 0)
-        {
-            dwc_compute_info.n0 = 4;
-        }
-        else if((depth_multiplier % 2) == 0)
-        {
-            dwc_compute_info.n0 = 2;
-        }
-        else
-        {
-            dwc_compute_info.n0 = 1;
-        }
-    }
-
-    dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0));
-
-    // Set m0 only if stride_x == 1 and dilation_x == 1
-    if(conv_info.stride().first == 1 && dilation.x() == 1)
-    {
-        const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
-        const size_t kernel_w = weights->tensor_shape()[idx_w];
-
-        if((kernel_w >= 9) || (kernel_w == 1))
-        {
-            dwc_compute_info.m0 = 1;
-        }
-        else
-        {
-            if(weights->data_type() == DataType::F16)
-            {
-                if((input->dimension(1) % 5) == 0)
-                {
-                    dwc_compute_info.m0 = 5;
-                }
-                else
-                {
-                    dwc_compute_info.m0 = 4;
-                }
-            }
-            else
-            {
-                dwc_compute_info.m0 = 2;
-            }
-        }
-    }
-    else
-    {
-        dwc_compute_info.m0 = 1;
-    }
-    return;
-}
-
-} // namespace
+using namespace arm_compute::cl_dwc;
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
@@ -261,8 +131,9 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
         output_shifts_to_use = &_output_shifts;
     }
 
-    DWCComputeKernelInfo dwc_native_compute_info;
-    initialize_dwc_native_compute_info(dwc_native_compute_info, input->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier, gpu_target);
+    // Get the depthwise convolution compute parameters
+    auto                       t                       = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+    const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
 
     const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
 
@@ -346,8 +217,9 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
 
-        DWCComputeKernelInfo dwc_native_compute_info;
-        initialize_dwc_native_compute_info(dwc_native_compute_info, input, &permuted_weights, conv_info, dilation, depth_multiplier, gpu_target);
+        // Get the depthwise convolution compute parameters
+        auto                       t                       = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
 
         ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
 
@@ -355,8 +227,9 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
     }
     else
     {
-        DWCComputeKernelInfo dwc_native_compute_info;
-        initialize_dwc_native_compute_info(dwc_native_compute_info, input, weights, conv_info, dilation, depth_multiplier, gpu_target);
+        // 
Get the depthwise convolution compute parameters + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier); ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); } diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..f55685ee49 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +namespace +{ +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier, bool is_g71) +{ + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if(is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if(depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && dilation.x() == 1) + { + if((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier, bool is_g71) +{ + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if(is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if(depth_multiplier == 1) + { + if(desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && 
dilation.x() == 1)
+        {
+            if((kernel_w >= 9) || (kernel_w == 1))
+            {
+                desc.m0 = 1;
+            }
+            else
+            {
+                if((src_w % 5) == 0)
+                {
+                    desc.m0 = 5;
+                }
+                else
+                {
+                    desc.m0 = 4;
+                }
+            }
+        }
+        else
+        {
+            desc.m0 = 1;
+        }
+    }
+
+    return desc;
+}
+} // namespace
+
+ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)
+    : IClDWCNativeKernelConfig(gpu)
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                unsigned int depth_multiplier)
+{
+    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                                                       unsigned int depth_multiplier);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32,
+                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
+                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32,
+                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
+                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+    switch(_target)
+    {
+        case GPUTarget::G71:
+            func = configs_G71.get_function(src->data_type());
+            break;
+        default:
+            func = configs_G7x.get_function(src->data_type());
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+    return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                       unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_UNUSED(wei);
+
+    DWCComputeKernelInfo desc;
+
+    if(src->data_layout() == DataLayout::NHWC)
+    {
+        desc.export_input_to_cl_image   = false;
+        desc.export_weights_to_cl_image = false;
+        desc.n0                         = (depth_multiplier == 1) ? 
4 : 1; + if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h new file mode 100644 index 0000000000..cec2cae5dd --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Bifrost based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */ diff --git 
a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..49485c83a9
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu)
+    : IClDWCNativeKernelConfig(gpu)
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                unsigned int depth_multiplier)
+{
+    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                                                       unsigned int depth_multiplier);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
+                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
+                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
+                                                                         &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
+                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+    switch(_target)
+    {
+        case GPUTarget::G77:
+            func = configs_G77.get_function(src->data_type());
+            break;
+        case GPUTarget::G78:
+        default:
+            func = configs_G78.get_function(src->data_type());
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+    return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    DWCComputeKernelInfo desc;
+
+    if(src->data_layout() == DataLayout::NHWC)
+    
{ + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if(depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && dilation.x() == 1) + { + if((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if(depth_multiplier == 1) + { + if(desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && dilation.x() == 1) + { + if((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 
4 : 1; + if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if(depth_multiplier == 1) + { + if(desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && dilation.x() == 1) + { + if((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..4d51fa668c --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Valhall based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp new file mode 100644 index 0000000000..5593c6de61 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) +{ + // Check whether we can use the cl image with the weights. 
+    if(!export_to_cl_image(weights))
+    {
+        return false;
+    }
+
+    const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t kernel_w = weights->tensor_shape()[idx_w];
+    const size_t kernel_h = weights->tensor_shape()[idx_h];
+
+    // Even when the weights could be exported to cl_image, we prefer the cl buffer storage in the following cases for performance reasons:
+    // 1- When the kernel size is 1x1
+    // 2- When the depth multiplier is greater than 1 and not a multiple of 4.
+    if((kernel_w == 1) && (kernel_h == 1))
+    {
+        return false;
+    }
+
+    if((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+    {
+        return false;
+    }
+
+    return true;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
new file mode 100644
index 0000000000..e3484c04ff
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensorInfo;
+
+namespace cl_dwc
+{
+/** Utility function to determine whether the cl image storage can be used for the weights of depthwise convolution to get better performance
+ *
+ * @param[in] weights          Weights TensorInfo of the depthwise convolution
+ * @param[in] depth_multiplier Depth multiplier
+ *
+ * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve the performance
+ */
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier);
+
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..c08053dcb3
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** ClDWCNativeKernelConfigurationFactory factory class */
+class ClDWCNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct the ClDWCNative kernel configuration class for the given GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return an IClDWCNativeKernelConfig instance
+     */
+    static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                // The heuristic for Midgard is the same as the one used for Arm Mali-G71
+                return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71);
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..b5df132a12
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Basic container for the OpenCL depthwise convolution configuration functions */
+template <typename T>
+class ClDWCNativeConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for depthwise convolution F32
+     * @param[in] func_f16  Function to call for depthwise convolution F16
+     * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8)
+        : _configs{ func_f32, func_f16, func_int8 }
+    {
+    }
+
+    /** Method to return the depthwise convolution configuration function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the configuration function for the given data type, or nullptr if the data type is not supported
+     */
+    T get_function(DataType data_type)
+    {
+        switch(data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs;
+};
+
+/** Basic interface for the depthwise convolution kernel configuration */
+class IClDWCNativeKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClDWCNativeKernelConfig(GPUTarget arch)
+        : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClDWCNativeKernelConfig() = default;
+    /** This method returns the @ref DWCComputeKernelInfo for the given inputs
+     *
+     * @param[in] src              Source tensor (activation tensor)
+     * @param[in] wei              Weights tensor
+     * @param[in] conv_info        Convolution info
+     * @param[in] dilation         Kernel dilation
+     * @param[in] depth_multiplier Output feature maps multiplier
+     */
+    virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                           unsigned int depth_multiplier) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
index 5d3dbf3146..990f050112 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
+++ 
b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp @@ -157,5 +157,5 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f1 return desc; } -} // namespace opencl +} // namespace cl_indirect_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h index cd9a6a5c37..68dca91885 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h @@ -47,6 +47,6 @@ private: DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace opencl +} // namespace cl_indirect_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h index 13716efb5f..73fbb87560 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -56,6 +56,6 @@ public: } } }; -} // namespace opencl +} // namespace cl_indirect_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h index e722488c3b..d2f4cde662 100644 --- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -45,8 +45,8 @@ public: /** Constructor * - * @param[in] func_f32 Function to call for direct convolution F32 - * @param[in] func_f16 Function to call for direct convolution F16 + * @param[in] func_f32 Function to call for indirect convolution F32 + * @param[in] func_f16 Function to call for indirect convolution F16 * */ ClIndirectConvConfigArray(T func_f32, T func_f16) @@ -103,6 +103,6 @@ public: protected: GPUTarget _target; }; -} // namespace opencl +} // namespace cl_indirect_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */ -- cgit v1.2.1
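
Usage note: the snippet below is a minimal sketch, not part of the patch, showing how the relocated heuristic is queried after this change. The factory and the configure() signature come from the headers added above; the tensor shapes, the G77 target, and the function name dwc_heuristic_example are illustrative assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"

using namespace arm_compute;

void dwc_heuristic_example()
{
    // Illustrative NHWC FP16 tensors: 64-channel 56x56 input, 3x3 depthwise weights
    TensorInfo src(TensorShape(64U, 56U, 56U), 1, DataType::F16);
    TensorInfo wei(TensorShape(64U, 3U, 3U), 1, DataType::F16);
    src.set_data_layout(DataLayout::NHWC);
    wei.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1
    const Size2D        dilation(1, 1);

    // Pick the per-architecture heuristic and query the kernel compute parameters
    auto                       heuristic = cl_dwc::ClDWCNativeKernelConfigurationFactory::create(GPUTarget::G77);
    const DWCComputeKernelInfo info      = heuristic->configure(&src, &wei, conv_info, dilation, /* depth_multiplier */ 1U);
    // info.n0 / info.m0 now hold the block sizes chosen for this GPU
    // (e.g. the 4x2 block this patch introduces for FP16 on G77)
}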