Diffstat (limited to 'src/runtime/heuristics')
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp      205
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h         61
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp      413
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h         61
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h                 65
-rw-r--r--  src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h               115
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp        309
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h           79
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp        326
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h           74
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp            61
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h              45
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h                   66
-rw-r--r--  src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h                 120
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp  167
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h     55
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h             62
-rw-r--r--  src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h           108
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp  314
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h     62
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp 113
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h    57
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp               134
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h                  83
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h             62
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h            63
-rw-r--r--  src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h           117
-rw-r--r--  src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h          135
28 files changed, 3532 insertions, 0 deletions
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
new file mode 100644
index 0000000000..aba32871d0
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
+{
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16,
+ &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+ &ClDirectConvDefaultConfigBifrost::configure_default_f32,
+ &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch (_target)
+ {
+ case GPUTarget::G71:
+ func = configs_G71.get_function(src->data_type());
+ break;
+ default:
+ func = configs_default.get_function(src->data_type());
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution");
+ return (this->*func)(src, wei, conv_info);
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 2;
+ }
+
+ desc.k0 = 8;
+
+ desc.export_weights_to_cl_image = false;
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 4;
+ }
+
+ desc.k0 = 8;
+
+ desc.export_weights_to_cl_image = false;
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 4;
+ }
+
+ desc.k0 = 16;
+
+ desc.export_weights_to_cl_image = false;
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 2;
+ }
+
+ desc.k0 = 8;
+
+ desc.export_weights_to_cl_image = export_to_cl_image(wei);
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 4;
+ }
+
+ desc.k0 = 8;
+
+ desc.export_weights_to_cl_image = export_to_cl_image(wei);
+ }
+
+ return desc;
+}
+} // namespace cl_direct_conv
+} // namespace arm_compute
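
Note on the file above: configure() dispatches first on the GPU target (one ClDirectConvConfigArray per target family) and then on the tensor data type, through a table of pointers to member functions. A minimal standalone sketch of that dispatch pattern, with made-up names and return values purely for illustration (not ACL code):

#include <array>
#include <cstdio>

struct Heuristic
{
    using Fn = int (Heuristic::*)() const; // pointer-to-member-function type

    int f32() const { return 32; }
    int f16() const { return 16; }
    int u8() const  { return 8; }

    int configure(unsigned int data_type_index) const
    {
        // One slot per supported data type, mirroring ClDirectConvConfigArray
        const std::array<Fn, 3> table = {&Heuristic::f32, &Heuristic::f16, &Heuristic::u8};
        const Fn fn = table.at(data_type_index);
        return (this->*fn)(); // same call syntax as (this->*func)(src, wei, conv_info)
    }
};

int main()
{
    Heuristic h;
    std::printf("%d\n", h.configure(1)); // prints 16
    return 0;
}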
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
new file mode 100644
index 0000000000..ed6a4c3c68
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST
+#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST
+
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** Bifrost based OpenCL direct convolution configuration */
+class ClDirectConvDefaultConfigBifrost final : public IClDirectConvKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClDirectConvDefaultConfigBifrost(GPUTarget gpu);
+
+ // Inherited overridden method
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+
+private:
+ DirectConvComputeKernelInfo
+ configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..4b7666d5aa
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
+{
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16,
+ &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57(
+ &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16,
+ &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch (_target)
+ {
+ case GPUTarget::G57:
+ func = configs_G57.get_function(src->data_type());
+ break;
+ case GPUTarget::G78:
+ default:
+ func = configs_G78.get_function(src->data_type());
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution");
+ return (this->*func)(src, wei, conv_info);
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const bool export_weights_to_cl_image = export_to_cl_image(wei);
+
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = dst_shape[1] * dst_shape[2];
+ const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
+
+ desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+ if (dst_shape[0] <= 4)
+ {
+ if (is_pointwise)
+ {
+ if (ofm == 4)
+ {
+ desc.m0 = 1;
+ desc.n0 = 4;
+ desc.k0 = 16;
+ }
+ else
+ {
+ desc.m0 = 1;
+ desc.n0 = 1;
+ desc.k0 = 16;
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ desc.n0 = 2;
+ desc.k0 = 16;
+ }
+ }
+ else
+ {
+ if (m < 64)
+ {
+ desc.m0 = 1;
+ desc.n0 = 1;
+ desc.k0 = 16;
+ }
+ else
+ {
+ desc.m0 = 4;
+ desc.n0 = 4;
+ desc.k0 = 4;
+ }
+ }
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const bool export_weights_to_cl_image = export_to_cl_image(wei);
+
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = dst_shape[1] * dst_shape[2];
+ const int32_t k = wei_shape[0];
+ const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
+
+ desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+ if (dst_shape[0] <= 4)
+ {
+            // k0 should be as large as possible. However, we should avoid
+            // having left-over for loops that make the implementation slower.
+ if ((k % 16) == 0)
+ {
+ desc.k0 = 16;
+ }
+ else if ((k % 8) == 0)
+ {
+ desc.k0 = 8;
+ }
+ else
+ {
+ desc.k0 = 4;
+ }
+
+ if (is_pointwise)
+ {
+ if (ofm == 4)
+ {
+ desc.m0 = 1;
+ desc.n0 = 4;
+ }
+ else
+ {
+ desc.m0 = 1;
+ desc.n0 = 1;
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ desc.n0 = dst_shape[0];
+ }
+ }
+ else
+ {
+ if (m < 64)
+ {
+ desc.m0 = 1;
+ desc.n0 = 1;
+ if ((k % 16) == 0)
+ {
+ desc.k0 = 16;
+ }
+ else if ((k % 8) == 0)
+ {
+ desc.k0 = 8;
+ }
+ else
+ {
+ desc.k0 = 4;
+ }
+ }
+ else
+ {
+ if (ofm >= 16)
+ {
+ if (m / 6 > 24000)
+ {
+ desc.m0 = 6;
+ }
+ else
+ {
+ desc.m0 = 5;
+ }
+ desc.n0 = 8;
+ desc.k0 = 4;
+ }
+ else
+ {
+ desc.m0 = 2;
+ desc.n0 = 8;
+ if ((k % 16) == 0)
+ {
+ desc.k0 = 16;
+ }
+ else if ((k % 8) == 0)
+ {
+ desc.k0 = 8;
+ }
+ else
+ {
+ desc.k0 = 4;
+ }
+ }
+ }
+ }
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 4;
+ }
+
+ desc.k0 = 16;
+
+ desc.export_weights_to_cl_image = false;
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const bool export_weights_to_cl_image = export_to_cl_image(wei);
+
+ const int32_t m = dst_shape[1] * dst_shape[2];
+ const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
+
+ desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+ if (dst_shape[0] <= 4)
+ {
+ if (is_pointwise)
+ {
+ desc.m0 = 1;
+ desc.n0 = 1;
+ desc.k0 = 16;
+ }
+ else
+ {
+ desc.m0 = 1;
+ desc.n0 = dst_shape[0];
+ desc.k0 = 16;
+ }
+ }
+ else
+ {
+ if (m < 64)
+ {
+ if (m == 1)
+ {
+ desc.m0 = 1;
+ desc.n0 = 1;
+ desc.k0 = 16;
+ }
+ else
+ {
+ desc.m0 = 4;
+ desc.n0 = 2;
+ desc.k0 = 8;
+ }
+ }
+ else
+ {
+ desc.m0 = 4;
+ desc.n0 = 4;
+ desc.k0 = 4;
+ }
+ }
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const bool export_weights_to_cl_image = export_to_cl_image(wei);
+
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = dst_shape[1] * dst_shape[2];
+ const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
+
+ desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+ if (dst_shape[0] <= 4)
+ {
+ if (is_pointwise)
+ {
+ desc.m0 = 2;
+ desc.n0 = 1;
+ desc.k0 = 16;
+ }
+ else
+ {
+ desc.m0 = 1;
+ desc.n0 = dst_shape[0];
+ desc.k0 = 16;
+ }
+ }
+ else
+ {
+ if (m < 64)
+ {
+ if (m == 1)
+ {
+ desc.m0 = 1;
+ desc.n0 = 1;
+ desc.k0 = 16;
+ }
+ else
+ {
+ desc.m0 = 4;
+ desc.n0 = 2;
+ desc.k0 = 8;
+ }
+ }
+ else
+ {
+ if (ofm > 16)
+ {
+ desc.m0 = 4;
+ desc.n0 = 8;
+ desc.k0 = 8;
+ }
+ else
+ {
+ desc.m0 = 8;
+ desc.n0 = 4;
+ desc.k0 = 4;
+ }
+ }
+ }
+ }
+
+ return desc;
+}
+} // namespace cl_direct_conv
+} // namespace arm_compute
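
Note on configure_G78_f16 above: k0 is chosen as the largest of {16, 8, 4} that divides the weights' input channel count k, so the innermost accumulation loop runs without a left-over tail. A self-contained sketch of just that selection rule (illustrative only, not ACL code):

#include <cassert>
#include <cstdint>

// Largest of {16, 8, 4} that divides k; mirrors the k0 selection in configure_G78_f16.
static std::int32_t pick_k0(std::int32_t k)
{
    if ((k % 16) == 0)
    {
        return 16;
    }
    if ((k % 8) == 0)
    {
        return 8;
    }
    return 4;
}

int main()
{
    assert(pick_k0(64) == 16); // no left-over iterations
    assert(pick_k0(24) == 8);
    assert(pick_k0(10) == 4);  // falls back to the smallest block
    return 0;
}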
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
new file mode 100644
index 0000000000..efd879a567
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL
+#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL
+
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** Valhall based OpenCL direct convolution configuration */
+class ClDirectConvDefaultConfigValhall final : public IClDirectConvKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClDirectConvDefaultConfigValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+
+private:
+ DirectConvComputeKernelInfo
+ configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
new file mode 100644
index 0000000000..215b17ef79
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
+
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h"
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** ClDirectConvolution factory class */
+class ClDirectConvKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the ClDirectConvolution kernel configuration class according to the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return IClDirectConvKernelConfig
+ */
+ static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu)
+ {
+ switch (get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71);
+ case GPUTarget::BIFROST:
+ return std::make_unique<ClDirectConvDefaultConfigBifrost>(gpu);
+ case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
+ return std::make_unique<ClDirectConvDefaultConfigValhall>(gpu);
+ default:
+ ARM_COMPUTE_ERROR("Not supported GPU target");
+ }
+ }
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
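
Taken together with the configuration classes above, a caller would obtain a heuristic object from this factory and then query it per layer. A hedged usage sketch follows; the helper name is hypothetical and the tensor and convolution descriptors are assumed to be built elsewhere, while the factory and configure() calls come from this patch:

#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"

namespace
{
// Hypothetical helper, not part of the patch: src, wei and conv_info are assumed valid.
arm_compute::DirectConvComputeKernelInfo query_direct_conv_config(arm_compute::GPUTarget            gpu,
                                                                  const arm_compute::ITensorInfo   *src,
                                                                  const arm_compute::ITensorInfo   *wei,
                                                                  const arm_compute::PadStrideInfo &conv_info)
{
    auto config = arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu);
    // The returned descriptor carries m0/n0/k0 and the export-to-cl_image flag.
    return config->configure(src, wei, conv_info);
}
} // namespace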
diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
new file mode 100644
index 0000000000..e5b270c720
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** Basic container for the OpenCL direct convolution configuration functions */
+template <class T>
+class ClDirectConvConfigArray
+{
+public:
+ /** Alias for F32 index */
+ static constexpr size_t DT_F32 = 0;
+ /** Alias for F16 index */
+ static constexpr size_t DT_F16 = 1;
+ /** Alias for Int8 index */
+ static constexpr size_t DT_INT8 = 2;
+
+ /** Constructor
+ *
+ * @param[in] func_f32 Function to call for direct convolution F32
+ * @param[in] func_f16 Function to call for direct convolution F16
+ * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+ *
+ */
+ ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+ {
+ }
+
+ /** Method to return the direct convolution configuration function based on data type
+ *
+ * @param[in] data_type Input data type
+ *
+     * @return the valid function, or nullptr if the data type is not valid
+ */
+ T get_function(DataType data_type)
+ {
+ switch (data_type)
+ {
+ case DataType::F32:
+ return _configs.at(DT_F32);
+ case DataType::F16:
+ return _configs.at(DT_F16);
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ return _configs.at(DT_INT8);
+ default:
+ return nullptr;
+ }
+ }
+
+private:
+ std::array<T, 3> _configs;
+};
+
+/** Basic interface for the Direct convolution kernel configuration */
+class IClDirectConvKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target
+ */
+ IClDirectConvKernelConfig(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig);
+ /** Virtual destructor */
+ virtual ~IClDirectConvKernelConfig() = default;
+ /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs
+ *
+ * @param[in] src Source tensor (activation tensor)
+ * @param[in] wei Weights tensor
+ * @param[in] conv_info Convolution info
+ */
+ virtual DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+
+protected:
+ GPUTarget _target;
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */
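
Note on ClDirectConvConfigArray above: all supported quantized types (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) share the single DT_INT8 slot, while unsupported types yield nullptr and trip the error macro in the callers. A small standalone sketch of that mapping, using a simplified enum rather than the real arm_compute::DataType:

#include <cassert>

// Simplified stand-in for arm_compute::DataType, for illustration only.
enum class DataType { F32, F16, QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL, S32 };

// Returns the slot index used by ClDirectConvConfigArray, or -1 for unsupported types.
static int slot_for(DataType dt)
{
    switch (dt)
    {
        case DataType::F32:
            return 0; // DT_F32
        case DataType::F16:
            return 1; // DT_F16
        case DataType::QASYMM8:
        case DataType::QASYMM8_SIGNED:
        case DataType::QSYMM8_PER_CHANNEL:
            return 2; // DT_INT8, shared by all quantized flavours
        default:
            return -1; // get_function() would return nullptr here
    }
}

int main()
{
    assert(slot_for(DataType::QASYMM8_SIGNED) == 2);
    assert(slot_for(DataType::S32) == -1);
    return 0;
}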
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
new file mode 100644
index 0000000000..98ebf3ebbe
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+namespace
+{
+DWCComputeKernelInfo configure_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ bool is_g71)
+{
+ DWCComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+
+ if (is_g71)
+ {
+ desc.export_weights_to_cl_image = false;
+ }
+ else
+ {
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+ }
+
+ if (depth_multiplier == 1)
+ {
+ desc.n0 = 4;
+ }
+ else
+ {
+ if ((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if ((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if ((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ desc.m0 = 2;
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+
+DWCComputeKernelInfo configure_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ bool is_g71)
+{
+ DWCComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Src and weights have the same dimension indices
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape src_shape = src->tensor_shape();
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t src_w = src_shape[idx_w];
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+
+ if (is_g71)
+ {
+ desc.export_weights_to_cl_image = false;
+ }
+ else
+ {
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+ }
+
+ if (depth_multiplier == 1)
+ {
+ if (desc.export_weights_to_cl_image == false)
+ {
+ desc.n0 = 8;
+ }
+ else
+ {
+ desc.n0 = 4;
+ }
+ }
+ else
+ {
+ if ((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if ((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if ((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ if ((src_w % 5) == 0)
+ {
+ desc.m0 = 5;
+ }
+ else
+ {
+ desc.m0 = 4;
+ }
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+} // namespace
+
+ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu)
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch (_target)
+ {
+ case GPUTarget::G71:
+ func = configs_G71.get_function(src->data_type());
+ break;
+ default:
+ func = configs_G7x.get_function(src->data_type());
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+ return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_UNUSED(wei);
+
+ DWCComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = false;
+ desc.n0 = (depth_multiplier == 1) ? 4 : 1;
+ if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+ {
+ desc.m0 = 2;
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
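
Note on the depthwise configurations above: n0 (output channels per work-item) is derived from the depth multiplier, 4 when the multiplier is 1 or divisible by 4, 2 when it is even, and 1 otherwise, before being clamped against the actual channel count with adjust_vec_size(). A standalone sketch of the first step only (the adjust_vec_size() clamp is ACL-internal and omitted):

#include <cassert>

// Mirrors the n0 selection in configure_f32/configure_f16 above; illustration only.
static unsigned int pick_n0(unsigned int depth_multiplier)
{
    if (depth_multiplier == 1 || (depth_multiplier % 4) == 0)
    {
        return 4;
    }
    if ((depth_multiplier % 2) == 0)
    {
        return 2;
    }
    return 1;
}

int main()
{
    assert(pick_n0(1) == 4); // depth_multiplier == 1
    assert(pick_n0(8) == 4); // multiple of 4
    assert(pick_n0(6) == 2); // even but not a multiple of 4
    assert(pick_n0(3) == 1); // odd
    return 0;
}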
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
new file mode 100644
index 0000000000..41d86c9c14
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST
+
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Bifrost based OpenCL depthwise convolution configuration */
+class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClDWCNativeDefaultConfigBifrost(GPUTarget gpu);
+
+ // Inherited overridden method
+ DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) override;
+
+private:
+ DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..ef1bb3858c
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu)
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch (_target)
+ {
+ case GPUTarget::G77:
+ func = configs_G77.get_function(src->data_type());
+ break;
+ case GPUTarget::G78:
+ default:
+ func = configs_G78.get_function(src->data_type());
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+ return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ DWCComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+ if (depth_multiplier == 1)
+ {
+ desc.n0 = 4;
+ }
+ else
+ {
+ if ((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if ((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if ((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ desc.m0 = 2;
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ DWCComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Src and weights have the same dimension indices
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape src_shape = src->tensor_shape();
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t src_w = src_shape[idx_w];
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+ if (depth_multiplier == 1)
+ {
+ if (desc.export_weights_to_cl_image == false)
+ {
+ desc.n0 = 8;
+ }
+ else
+ {
+ desc.n0 = 4;
+ }
+ }
+ else
+ {
+ if ((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if ((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if ((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ if ((src_w % 5) == 0)
+ {
+ desc.m0 = 5;
+ }
+ else
+ {
+ desc.m0 = 4;
+ }
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_UNUSED(wei);
+
+ DWCComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = false;
+ desc.n0 = (depth_multiplier == 1) ? 4 : 1;
+ if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+ {
+ desc.m0 = 2;
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ DWCComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+ if (depth_multiplier == 1)
+ {
+ if (desc.export_weights_to_cl_image == false)
+ {
+ desc.n0 = 8;
+ }
+ else
+ {
+ desc.n0 = 4;
+ }
+ }
+ else
+ {
+ if ((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if ((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if ((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ desc.m0 = 2;
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
new file mode 100644
index 0000000000..fabce77b54
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL
+
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Valhall based OpenCL depthwise convolution configuration */
+class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClDWCNativeDefaultConfigValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) override;
+
+private:
+ DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
new file mode 100644
index 0000000000..c8b006c546
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier)
+{
+ // Check whether we can use the cl image with the weights.
+ if (!export_to_cl_image(weights))
+ {
+ return false;
+ }
+
+ const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t kernel_w = weights->tensor_shape()[idx_w];
+ const size_t kernel_h = weights->tensor_shape()[idx_h];
+
+ // Even when the weights could use the cl image storage, we prefer the cl buffer storage in the following cases for performance reasons:
+ // 1- When the kernel size is 1x1
+ // 2- When the depth multiplier is greater than 1 and not a multiple of 4.
+ if ((kernel_w == 1) && (kernel_h == 1))
+ {
+ return false;
+ }
+
+ if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+ {
+ return false;
+ }
+
+ return true;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
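A minimal sketch of how a configuration heuristic might consume use_cl_image_for_weights(); the export_weights_to_cl_image field is assumed from the DWCComputeKernelInfo descriptor used elsewhere in this patch, and the function name is illustrative.

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/TensorInfo.h"

    #include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"

    arm_compute::DWCComputeKernelInfo make_desc(const arm_compute::ITensorInfo *weights, unsigned int depth_multiplier)
    {
        arm_compute::DWCComputeKernelInfo desc{};
        // Only request cl image storage for the weights when the helper says it pays off
        desc.export_weights_to_cl_image = arm_compute::cl_dwc::use_cl_image_for_weights(weights, depth_multiplier);
        return desc;
    }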
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
new file mode 100644
index 0000000000..e3484c04ff
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensorInfo;
+
+namespace cl_dwc
+{
+/** Utility function to query whether the cl image storage can be used for the weights of a depthwise convolution to get better performance
+ *
+ * @param[in] weights Weights TensorInfo of the depthwise convolution
+ * @param[in] depth_multiplier Depth multiplier
+ *
+ * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve the performance
+ */
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier);
+
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..031cf1859a
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** ClDWCNativeKernelConfigurationFactory factory class */
+class ClDWCNativeKernelConfigurationFactory final
+{
+public:
+ /** Static method to select the ClDWCNative kernel configuration class according to the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return IClDWCNativeKernelConfig
+ */
+ static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu)
+ {
+ switch (get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ // The heuristic for Midgard is the same as the one used for Arm Mali-G71
+ return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71);
+ case GPUTarget::BIFROST:
+ return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu);
+ case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
+ return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu);
+ default:
+ ARM_COMPUTE_ERROR("Not supported GPU target");
+ }
+ }
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H
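A minimal usage sketch for the factory (assumed call site, not part of the patch; the pick_config name is illustrative): the factory maps the GPU architecture to the right heuristic class, whose configure() method then returns the kernel descriptor.

    #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"

    namespace
    {
    arm_compute::DWCComputeKernelInfo pick_config(arm_compute::GPUTarget            gpu,
                                                  const arm_compute::ITensorInfo   *src,
                                                  const arm_compute::ITensorInfo   *wei,
                                                  const arm_compute::PadStrideInfo &conv_info,
                                                  const arm_compute::Size2D        &dilation,
                                                  unsigned int                      depth_multiplier)
    {
        // Select the per-architecture heuristic, then query the configuration for this workload
        auto heuristic = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu);
        return heuristic->configure(src, wei, conv_info, dilation, depth_multiplier);
    }
    } // namespace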
diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..614a6622df
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Basic container for the OpenCL depthwise convolution configuration functions */
+template <class T>
+class ClDWCNativeConfigArray
+{
+public:
+ /** Alias for F32 index */
+ static constexpr size_t DT_F32 = 0;
+ /** Alias for F16 index */
+ static constexpr size_t DT_F16 = 1;
+ /** Alias for Int8 index */
+ static constexpr size_t DT_INT8 = 2;
+
+ /** Constructor
+ *
+ * @param[in] func_f32 Function to call for depthwise convolution F32
+ * @param[in] func_f16 Function to call for depthwise convolution F16
+ * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+ *
+ */
+ ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+ {
+ }
+
+ /** Method to return the depthwise convolution configuration function based on data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return the valid function, or nullptr if the data type is not supported
+ */
+ T get_function(DataType data_type)
+ {
+ switch (data_type)
+ {
+ case DataType::F32:
+ return _configs.at(DT_F32);
+ case DataType::F16:
+ return _configs.at(DT_F16);
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ return _configs.at(DT_INT8);
+ default:
+ return nullptr;
+ }
+ }
+
+private:
+ std::array<T, 3> _configs;
+};
+
+/** Basic interface for the depthwise convolution kernel configuration */
+class IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target
+ */
+ IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig);
+ /** Virtual destructor */
+ virtual ~IClDWCNativeKernelConfig() = default;
+ /** This method returns the @ref DWCComputeKernelInfo for the given inputs
+ *
+ * @param[in] src Source tensor (activation tensor)
+ * @param[in] wei Weights tensor
+ * @param[in] conv_info Convolution info
+ * @param[in] dilation Kernel dilation
+ * @param[in] depth_multiplier Output feature maps multiplier
+ */
+ virtual DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) = 0;
+
+protected:
+ GPUTarget _target;
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */
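ClDWCNativeConfigArray is used by the concrete configuration classes as a member-function-pointer table indexed by data type. The following self-contained sketch reproduces that dispatch pattern with illustrative names only, so the mechanism can be compiled and exercised in isolation.

    #include <array>
    #include <cstdio>

    enum class DT
    {
        F32,
        F16,
        INT8
    };

    // Simplified stand-in for ClDWCNativeConfigArray: a table of callables indexed by data type
    template <class T>
    class ConfigArray
    {
    public:
        ConfigArray(T f32, T f16, T i8) : _configs{f32, f16, i8}
        {
        }
        T get_function(DT dt)
        {
            return (dt == DT::F32) ? _configs[0] : (dt == DT::F16) ? _configs[1] : _configs[2];
        }

    private:
        std::array<T, 3> _configs;
    };

    struct Heuristic
    {
        using Fn = int (Heuristic::*)();
        int cfg_f32() { return 32; }
        int cfg_f16() { return 16; }
        int cfg_i8()  { return 8; }

        int configure(DT dt)
        {
            // Pick the per-data-type member function and invoke it on this object
            ConfigArray<Fn> configs(&Heuristic::cfg_f32, &Heuristic::cfg_f16, &Heuristic::cfg_i8);
            Fn fn = configs.get_function(dt);
            return (this->*fn)();
        }
    };

    int main()
    {
        Heuristic h;
        std::printf("%d\n", h.configure(DT::F16)); // prints 16
        return 0;
    }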
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..3380d8f1b7
--- /dev/null
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace cl_indirect_conv
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu)
+{
+}
+
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+
+ ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
+
+ // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise), because the indirect buffer makes
+ // indirect convolution less efficient than direct convolution or gemm. For this reason, the indirect convolution heuristic has not been
+ // tuned for pointwise convolution.
+
+ ConfigurationFunctionExecutorPtr func = configs_G77.get_function(src->data_type());
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for indirect convolution");
+ return (this->*func)(src, wei, conv_info);
+}
+
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const bool export_weights_to_cl_image = export_to_cl_image(wei);
+ const int32_t stride_x = conv_info.stride().first;
+ const int32_t stride_y = conv_info.stride().second;
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y);
+
+ desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+ if (ofm <= 4)
+ {
+ desc.m0 = 1;
+ desc.n0 = 2;
+ desc.k0 = 16;
+ }
+ else
+ {
+ // The 16000 threshold value has been identified as the right
+ // one for using the biggest block size allowed on F32: 5x4x4
+ if (m < 16000)
+ {
+ desc.m0 = 4;
+ desc.n0 = 4;
+ desc.k0 = 4;
+ }
+ else
+ {
+ desc.m0 = 5;
+ desc.n0 = 4;
+ desc.k0 = 4;
+ }
+ }
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const bool export_weights_to_cl_image = export_to_cl_image(wei);
+
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = dst_shape[1] * dst_shape[2];
+ const int32_t k = wei_shape[0];
+
+ desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+ if (ofm <= 4)
+ {
+ // k0 should be as large as possible. However, we should avoid
+ // left-over for loops that make the implementation slower.
+ if ((k % 16) == 0)
+ {
+ desc.k0 = 16;
+ }
+ else if ((k % 8) == 0)
+ {
+ desc.k0 = 8;
+ }
+ else
+ {
+ desc.k0 = 4;
+ }
+
+ desc.m0 = 1;
+ desc.n0 = ofm;
+ }
+ else
+ {
+ // The 16000 threshold value has been identified as the right
+ // one for using the biggest block size allowed on F16: 8x4
+ if (m >= 16000 && k < 4)
+ {
+ desc.m0 = 8;
+ desc.n0 = 4;
+ desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4
+ }
+ else
+ {
+ desc.m0 = 5;
+ desc.n0 = 4;
+ desc.k0 = 8;
+ }
+ }
+ }
+
+ return desc;
+}
+} // namespace cl_indirect_conv
+} // namespace arm_compute
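As a worked example of the F16 branch above for ofm <= 4: the largest of {16, 8, 4} that divides K is chosen as k0 so that the kernel has no left-over loop. A small sketch of that choice (illustrative, not part of the patch):

    #include <cstdio>

    static int pick_k0(int k)
    {
        if ((k % 16) == 0)
        {
            return 16;
        }
        if ((k % 8) == 0)
        {
            return 8;
        }
        return 4;
    }

    int main()
    {
        std::printf("K=32 -> k0=%d\n", pick_k0(32)); // 16
        std::printf("K=24 -> k0=%d\n", pick_k0(24)); // 8
        std::printf("K=10 -> k0=%d\n", pick_k0(10)); // 4 (any left-over is handled by the kernel)
        return 0;
    }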
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
new file mode 100644
index 0000000000..bab808c66c
--- /dev/null
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL
+#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL
+
+#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_indirect_conv
+{
+/** Valhall based OpenCL indirect convolution configuration */
+class ClIndirectConvDefaultConfigValhall final : public IClIndirectConvKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClIndirectConvDefaultConfigValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+
+private:
+ DirectConvComputeKernelInfo
+ configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+};
+} // namespace cl_indirect_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
new file mode 100644
index 0000000000..5e7ba6f8e9
--- /dev/null
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H
+
+#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h"
+#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_indirect_conv
+{
+/** ClIndirectConvolution factory class */
+class ClIndirectConvKernelConfigurationFactory final
+{
+public:
+ /** Static method to select the ClIndirectConvolution kernel configuration class according to the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return IClIndirectConvKernelConfig
+ */
+ static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu)
+ {
+ switch (get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::BIFROST:
+ case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
+ return std::make_unique<ClIndirectConvDefaultConfigValhall>(gpu);
+ default:
+ ARM_COMPUTE_ERROR("Not supported GPU target");
+ }
+ }
+};
+} // namespace cl_indirect_conv
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H
diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
new file mode 100644
index 0000000000..d05da18b58
--- /dev/null
+++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_indirect_conv
+{
+/** Basic container for the OpenCL indirect convolution configuration functions */
+template <class T>
+class ClIndirectConvConfigArray
+{
+public:
+ /** Alias for F32 index */
+ static constexpr size_t DT_F32 = 0;
+ /** Alias for F16 index */
+ static constexpr size_t DT_F16 = 1;
+
+ /** Constructor
+ *
+ * @param[in] func_f32 Function to call for indirect convolution F32
+ * @param[in] func_f16 Function to call for indirect convolution F16
+ *
+ */
+ ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16}
+ {
+ }
+
+ /** Method to return the indirect convolution configuration function based on data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return the valid function, or nullptr if the data type is not supported
+ */
+ T get_function(DataType data_type)
+ {
+ switch (data_type)
+ {
+ case DataType::F32:
+ return _configs.at(DT_F32);
+ case DataType::F16:
+ return _configs.at(DT_F16);
+ default:
+ return nullptr;
+ }
+ }
+
+private:
+ std::array<T, 2> _configs;
+};
+
+/** Basic interface for the indirect convolution kernel configuration */
+class IClIndirectConvKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target
+ */
+ IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig);
+ /** Virtual destructor */
+ virtual ~IClIndirectConvKernelConfig() = default;
+ /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs
+ *
+ * @param[in] src Source tensor (activation tensor)
+ * @param[in] wei Weights tensor
+ * @param[in] conv_info Convolution info
+ */
+ virtual DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+
+protected:
+ GPUTarget _target;
+};
+} // namespace cl_indirect_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..3a02a60650
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu)
+{
+}
+
+MatMulKernelInfo
+ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info)
+{
+ using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+
+ ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_f32,
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
+
+ ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(
+ &ClMatMulNativeDefaultConfigValhall::configure_G715_f32,
+ &ClMatMulNativeDefaultConfigValhall::configure_G715_f16,
+ &ClMatMulNativeDefaultConfigValhall::configure_G715_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch (_target)
+ {
+ case GPUTarget::G715:
+ case GPUTarget::G615:
+ func = configs_G715.get_function(lhs->data_type());
+ break;
+ case GPUTarget::G710:
+ default:
+ func = configs_G710.get_function(lhs->data_type());
+ break;
+ }
+
+ const bool adj_lhs = info.adj_lhs();
+ const bool adj_rhs = info.adj_rhs();
+
+ TensorShape lhs_shape = lhs->tensor_shape();
+ TensorShape rhs_shape = rhs->tensor_shape();
+
+ const bool is_batched = lhs_shape.num_dimensions() > 2;
+
+ if (is_batched == true)
+ {
+ lhs_shape.collapse_from(2);
+ }
+
+ const unsigned int m = adj_lhs ? lhs_shape.x() : lhs_shape.y();
+ const unsigned int n = adj_rhs ? rhs_shape.y() : rhs_shape.x();
+ const unsigned int k = adj_lhs ? lhs_shape.y() : lhs_shape.x();
+ const unsigned int b = lhs_shape.z();
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native");
+ return (this->*func)(m, n, k, b, rhs->lock_paddings(), info);
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding);
+ return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 4, /* k0 */ 1, /* export_to_cl_image */ false};
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ return configure_G715_f32(m, n, k, b, rhs_lock_padding, info);
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding);
+ return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 4, /* n0 */ 16, /* k0 */ 4, /* export_to_cl_image */ false};
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1},
+ {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1},
+ {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0},
+ {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+ {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0},
+ {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+ {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1},
+ {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+ const bool adj_lhs = info.adj_lhs();
+ const bool adj_rhs = info.adj_rhs();
+
+ const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr;
+ const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr;
+
+ if ((adj_lhs == false) && (adj_rhs == false))
+ {
+ configs_best_to_use = &configs_mnkb_best_nt_nt;
+ configs_fallback_to_use = &configs_mnkb_fallback_nt_nt;
+ }
+ else if ((adj_lhs == false) && (adj_rhs == true))
+ {
+ configs_best_to_use = &configs_mnkb_best_nt_t;
+ configs_fallback_to_use = &configs_mnkb_fallback_nt_t;
+ }
+ else if ((adj_lhs == true) && (adj_rhs == false))
+ {
+ configs_best_to_use = &configs_mnkb_best_t_nt;
+ configs_fallback_to_use = &configs_mnkb_fallback_t_nt;
+ }
+ else
+ {
+ configs_best_to_use = &configs_mnkb_best_t_t;
+ configs_fallback_to_use = &configs_mnkb_fallback_t_t;
+ }
+
+ MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b);
+ MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b);
+
+ return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding);
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1},
+ {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+ {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0},
+ {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0},
+ {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1},
+ {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+ {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0},
+ {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0},
+ {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0},
+ {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+ const bool adj_lhs = info.adj_lhs();
+ const bool adj_rhs = info.adj_rhs();
+
+ const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr;
+ const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr;
+
+ if ((adj_lhs == false) && (adj_rhs == false))
+ {
+ configs_best_to_use = &configs_mnkb_best_nt_nt;
+ configs_fallback_to_use = &configs_mnkb_fallback_nt_nt;
+ }
+ else if ((adj_lhs == false) && (adj_rhs == true))
+ {
+ configs_best_to_use = &configs_mnkb_best_nt_t;
+ configs_fallback_to_use = &configs_mnkb_fallback_nt_t;
+ }
+ else if ((adj_lhs == true) && (adj_rhs == false))
+ {
+ configs_best_to_use = &configs_mnkb_best_t_nt;
+ configs_fallback_to_use = &configs_mnkb_fallback_t_nt;
+ }
+ else
+ {
+ configs_best_to_use = &configs_mnkb_best_t_t;
+ configs_fallback_to_use = &configs_mnkb_fallback_t_t;
+ }
+
+ MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b);
+ MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b);
+
+ return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding);
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ ARM_COMPUTE_UNUSED(rhs_lock_padding);
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0},
+ {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0},
+ {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0},
+ {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0},
+ {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}};
+
+ const bool adj_lhs = info.adj_lhs();
+ const bool adj_rhs = info.adj_rhs();
+
+ if ((adj_lhs == false) && (adj_rhs == false))
+ {
+ return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b);
+ }
+ else if ((adj_lhs == false) && (adj_rhs == true))
+ {
+ return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b);
+ }
+ else if ((adj_lhs == true) && (adj_rhs == false))
+ {
+ return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b);
+ }
+ else
+ {
+ return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b);
+ }
+}
+} // namespace cl_matmul
+} // namespace arm_compute
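configure() reduces the operands to an (m, n, k, b) tuple before dispatching: LHS batch dimensions beyond the second are collapsed, and the adjoint flags decide which axis of each operand supplies M, N and K. A sketch of that mapping, assuming the ACL convention that TensorShape::x() is the innermost dimension (the helper below is illustrative):

    #include <cstdio>

    struct Dims
    {
        unsigned int x, y, z; // innermost-first, as in arm_compute::TensorShape
    };

    static void to_mnkb(Dims lhs, Dims rhs, bool adj_lhs, bool adj_rhs)
    {
        const unsigned int m = adj_lhs ? lhs.x : lhs.y;
        const unsigned int n = adj_rhs ? rhs.y : rhs.x;
        const unsigned int k = adj_lhs ? lhs.y : lhs.x;
        const unsigned int b = lhs.z; // batch dims beyond the second are collapsed into z beforehand
        std::printf("m=%u n=%u k=%u b=%u\n", m, n, k, b);
    }

    int main()
    {
        // LHS is K=64 x M=128 with 4 batches, RHS is N=32 x K=64
        to_mnkb({64, 128, 4}, {32, 64, 1}, /* adj_lhs */ false, /* adj_rhs */ false); // m=128 n=32 k=64 b=4
        // With adj_lhs = true the LHS is stored transposed, so x supplies M and y supplies K
        to_mnkb({128, 64, 4}, {32, 64, 1}, /* adj_lhs */ true, /* adj_rhs */ false); // m=128 n=32 k=64 b=4
        return 0;
    }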
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
new file mode 100644
index 0000000000..5279871057
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
+
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Valhall based OpenCL matmul configuration */
+class ClMatMulNativeDefaultConfigValhall final : public IClMatMulNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClMatMulNativeDefaultConfigValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override;
+
+private:
+ MatMulKernelInfo configure_G710_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G710_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G710_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G715_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G715_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G715_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp
new file mode 100644
index 0000000000..3878f698fd
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+ClMatMulNativeDefaultVariantValhall::ClMatMulNativeDefaultVariantValhall(GPUTarget gpu)
+ : IClMatMulNativeKernelVariant(gpu)
+{
+}
+
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::select_kernel(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulInfo &info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(rhs);
+
+ using VariantFunctionExecutorPtr =
+ MatMulKernelType (ClMatMulNativeDefaultVariantValhall::*)(int k, bool act_enabled);
+
+ ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_G715(
+ &ClMatMulNativeDefaultVariantValhall::configure_G715_float,
+ &ClMatMulNativeDefaultVariantValhall::configure_G715_quantized);
+
+ ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_default(
+ &ClMatMulNativeDefaultVariantValhall::configure_default_float,
+ &ClMatMulNativeDefaultVariantValhall::configure_default_quantized);
+
+ VariantFunctionExecutorPtr func = nullptr;
+ switch (_target)
+ {
+ case GPUTarget::G715:
+ case GPUTarget::G615:
+ func = configs_G715.get_function(lhs->data_type());
+ break;
+ default:
+ func = configs_default.get_function(lhs->data_type());
+ break;
+ }
+
+ const int k = info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+ const bool act_enabled = act_info.enabled();
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native");
+ return (this->*func)(k, act_enabled);
+}
+
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_float(int k, bool act_enabled)
+{
+ // MMUL kernel works only when K is a multiple of 4
+ if (!act_enabled && k % 4 == 0)
+ {
+ return MatMulKernelType::NATIVE_MMUL_FP;
+ }
+
+ return MatMulKernelType::NATIVE_FP;
+}
+
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_quantized(int k, bool act_enabled)
+{
+ // MMUL kernel works only when K is a multiple of 16
+ if (!act_enabled && k % 16 == 0)
+ {
+ return MatMulKernelType::NATIVE_MMUL_QUANTIZED;
+ }
+
+ return MatMulKernelType::NATIVE_QUANTIZED;
+}
+
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_float(int k, bool act_enabled)
+{
+ ARM_COMPUTE_UNUSED(k, act_enabled);
+
+ return MatMulKernelType::NATIVE_FP;
+}
+
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_quantized(int k, bool act_enabled)
+{
+ ARM_COMPUTE_UNUSED(k, act_enabled);
+
+ return MatMulKernelType::NATIVE_QUANTIZED;
+}
+
+} // namespace cl_matmul
+} // namespace arm_compute
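On G715/G615 the variant selection above only picks the MMUL kernels when no activation is fused and K is a multiple of the MMUL tile (4 for float, 16 for quantized); otherwise it falls back to the plain native kernels. A compact restatement of the float rule (illustrative only):

    #include <cstdio>

    static const char *pick_float_variant(int k, bool act_enabled)
    {
        // MMUL needs K to be a multiple of 4 and no fused activation
        return (!act_enabled && (k % 4) == 0) ? "NATIVE_MMUL_FP" : "NATIVE_FP";
    }

    int main()
    {
        std::printf("%s\n", pick_float_variant(64, false)); // NATIVE_MMUL_FP
        std::printf("%s\n", pick_float_variant(62, false)); // NATIVE_FP: K is not a multiple of 4
        std::printf("%s\n", pick_float_variant(64, true));  // NATIVE_FP: a fused activation is requested
        return 0;
    }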
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h
new file mode 100644
index 0000000000..a202676e98
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
+
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Valhall based OpenCL matmul kernel variant selection */
+class ClMatMulNativeDefaultVariantValhall final : public IClMatMulNativeKernelVariant
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClMatMulNativeDefaultVariantValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ MatMulKernelType select_kernel(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulInfo &info,
+ const ActivationLayerInfo &act_info) override;
+
+private:
+ MatMulKernelType configure_G715_float(int k, bool act_enabled);
+ MatMulKernelType configure_G715_quantized(int k, bool act_enabled);
+ MatMulKernelType configure_default_float(int k, bool act_enabled);
+ MatMulKernelType configure_default_quantized(int k, bool act_enabled);
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
new file mode 100644
index 0000000000..89cad30214
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include <cmath>
+#include <limits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
+ const MatMulKernelInfo &info1,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type,
+ bool rhs_lock_padding)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true,
+ "The fallback MatMul configuration cannot have export_to_cl_image = true");
+ ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+ "The MatMul configurations must have the same adj_lhs value");
+ ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+ "The MatMul configurations must have the same adj_rhs value");
+
+ const bool adj_lhs = info0.adj_lhs;
+ const bool adj_rhs = info0.adj_rhs;
+
+ TensorInfo lhs_info =
+ !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
+ TensorInfo rhs_info =
+ !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
+ TensorInfo dst_info;
+
+ if (rhs_lock_padding == false)
+ {
+ if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+ {
+ return info0;
+ }
+ else
+ {
+ return info1;
+ }
+ }
+ else
+ {
+ return info1;
+ }
+}
+
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+ bool adj_lhs,
+ bool adj_rhs,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b)
+{
+ size_t min_acc = std::numeric_limits<size_t>::max();
+ size_t min_idx = 0;
+
+ ARM_COMPUTE_ERROR_ON(configs.size() == 0);
+ const size_t num_rows = configs.size();
+ const size_t num_cols = configs[0].size();
+
+ ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U,
+ "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
+ ARM_COMPUTE_UNUSED(num_cols);
+
+ // Find the nearest GeMM workload (Euclidean distance over the M, N, K and B dimensions)
+ for (size_t y = 0; y < num_rows; ++y)
+ {
+ size_t mc0 = static_cast<size_t>(configs[y][0]);
+ size_t nc0 = static_cast<size_t>(configs[y][1]);
+ size_t kc0 = static_cast<size_t>(configs[y][2]);
+ size_t bc0 = static_cast<size_t>(configs[y][3]);
+
+ size_t acc = 0;
+ acc += (m - mc0) * (m - mc0);
+ acc += (n - nc0) * (n - nc0);
+ acc += (k - kc0) * (k - kc0);
+ acc += (b - bc0) * (b - bc0);
+ acc = std::sqrt(acc);
+ if (acc < min_acc)
+ {
+ min_acc = acc;
+ min_idx = y;
+ }
+ }
+
+ // Get the configuration from the nearest GeMM shape
+ MatMulKernelInfo desc;
+ desc.adj_lhs = adj_lhs;
+ desc.adj_rhs = adj_rhs;
+ desc.m0 = configs[min_idx][4];
+ desc.n0 = configs[min_idx][5];
+ desc.k0 = configs[min_idx][6];
+ desc.export_rhs_to_cl_image = configs[min_idx][7];
+
+ return desc;
+}
+} // namespace cl_matmul
+} // namespace arm_compute
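find_info() selects the row of the configuration matrix whose (M, N, K, B) columns are closest, in Euclidean distance, to the requested workload, and reads m0/n0/k0/IMG_RHS from the remaining columns. A self-contained sketch of that lookup (the two rows are copied from the tables above; the query workload is illustrative):

    #include <cmath>
    #include <cstdio>
    #include <limits>
    #include <vector>

    int main()
    {
        // Each row: {M, N, K, B, M0, N0, K0, IMG_RHS}
        const std::vector<std::vector<int>> configs = {{3136, 64, 64, 36, 4, 4, 16, 1},
                                                       {24, 464, 412, 24, 2, 8, 4, 1}};

        const int m = 3000, n = 70, k = 60, b = 32; // query workload

        size_t best     = 0;
        double best_acc = std::numeric_limits<double>::max();
        for (size_t y = 0; y < configs.size(); ++y)
        {
            const int q[4] = {m, n, k, b};

            double acc = 0.0;
            for (int i = 0; i < 4; ++i)
            {
                const double d = static_cast<double>(q[i] - configs[y][i]);
                acc += d * d;
            }
            acc = std::sqrt(acc);

            if (acc < best_acc)
            {
                best_acc = acc;
                best     = y;
            }
        }

        // The remaining columns of the nearest row become the kernel configuration
        std::printf("m0=%d n0=%d k0=%d img_rhs=%d\n", configs[best][4], configs[best][5], configs[best][6],
                    configs[best][7]);
        return 0;
    }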
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
new file mode 100644
index 0000000000..699f5fe8c1
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+// Forward declaration
+struct MatMulKernelInfo;
+
+namespace cl_matmul
+{
+using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>;
+
+/** This function accepts two MatMulKernelInfo objects, where only the first may have cl_image2d support enabled.
+ * It checks whether the first MatMulKernelInfo object is valid: if so, the first object is returned; otherwise,
+ * the second one is returned.
+ *
+ * @param[in] info0 MatMulKernelInfo with cl_image2d support
+ * @param[in] info1 MatMulKernelInfo to fall back on if cl_image2d cannot be used
+ * @param[in] m Number of rows (M) of the LHS matrix
+ * @param[in] n Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b Batch size
+ * @param[in] data_type Data type
+ * @param[in] rhs_lock_padding Flag used to know whether the RHS paddings are locked
+ *
+ * @return @ref MatMulKernelInfo
+ */
+MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
+ const MatMulKernelInfo &info1,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type,
+ bool rhs_lock_padding);
+
+/** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user
+ *
+ * @param[in] configs List of best configurations for a limited number of MatMul shapes
+ * @param[in] adj_lhs Adjoint LHS flag value
+ * @param[in] adj_rhs Adjoint RHS flag value
+ * @param[in] m Number of rows (M) of the LHS matrix
+ * @param[in] n Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b Batch size
+ *
+ * @return @ref MatMulKernelInfo
+ */
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+ bool adj_lhs,
+ bool adj_rhs,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b);
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
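As a usage sketch for select_info (the block sizes and workload below are hypothetical), the caller passes an image2d-enabled candidate plus a plain fallback and keeps whichever one validates:

    arm_compute::MatMulKernelInfo choose_matmul_info()
    {
        using namespace arm_compute;
        using namespace arm_compute::cl_matmul;

        // Candidate that exports the RHS matrix to a cl_image2d object (illustrative block sizes).
        MatMulKernelInfo with_image;
        with_image.m0                     = 4;
        with_image.n0                     = 4;
        with_image.k0                     = 4;
        with_image.export_rhs_to_cl_image = true;

        // Plain fallback with the same block sizes.
        MatMulKernelInfo without_image        = with_image;
        without_image.export_rhs_to_cl_image = false;

        // select_info() keeps with_image only if the cl_image2d variant validates for this workload.
        return select_info(with_image, without_image, /* m */ 128U, /* n */ 128U, /* k */ 64U, /* b */ 1U,
                           DataType::F16, /* rhs_lock_padding */ false);
    }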
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..e7485bca81
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** ClMatMul configuration factory class */
+class ClMatMulNativeKernelConfigurationFactory final
+{
+public:
+ /** Static method to call the ClMatMul configuration class according to the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return IClMatMulNativeKernelConfig
+ */
+ static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu)
+ {
+ switch (get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::BIFROST:
+ case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
+ return std::make_unique<ClMatMulNativeDefaultConfigValhall>(gpu);
+ default:
+ ARM_COMPUTE_ERROR("Not supported GPU target");
+ }
+ }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
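As a hedged sketch of how this factory is meant to be consumed (the operand descriptors are assumed to be initialised elsewhere), a caller obtains the heuristic for its GPU and queries it for block sizes:

    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/function_info/MatMulInfo.h"
    #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"

    // Returns the heuristic M0/N0/K0 (and cl_image2d flag) for the given MatMul operands.
    arm_compute::MatMulKernelInfo pick_matmul_config(const arm_compute::ITensorInfo *lhs,
                                                     const arm_compute::ITensorInfo *rhs,
                                                     const arm_compute::MatMulInfo  &info)
    {
        using namespace arm_compute::cl_matmul;
        auto heuristic = ClMatMulNativeKernelConfigurationFactory::create(arm_compute::GPUTarget::VALHALL);
        return heuristic->configure(lhs, rhs, info);
    }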
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..c2895b8919
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+
+/** ClMatMul variant factory class */
+class ClMatMulNativeKernelVariantFactory final
+{
+public:
+ /** Static method to call the ClMatMul variant class according to the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return IClMatMulNativeKernelVariant
+ */
+ static std::unique_ptr<IClMatMulNativeKernelVariant> create(GPUTarget gpu)
+ {
+ switch (get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::BIFROST:
+ case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
+ return std::make_unique<ClMatMulNativeDefaultVariantValhall>(gpu);
+ default:
+ ARM_COMPUTE_ERROR("Not supported GPU target");
+ }
+ }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
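A similar sketch for the variant factory: given the operands, the MatMul descriptor and the fused activation, it reports which kernel flavour to use.

    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/function_info/ActivationLayerInfo.h"
    #include "arm_compute/function_info/MatMulInfo.h"
    #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h"

    // Chooses between the plain and MMUL-based, FP and quantized MatMul kernels.
    arm_compute::cl_matmul::MatMulKernelType pick_matmul_kernel(const arm_compute::ITensorInfo          *lhs,
                                                                const arm_compute::ITensorInfo          *rhs,
                                                                const arm_compute::MatMulInfo           &info,
                                                                const arm_compute::ActivationLayerInfo  &act_info)
    {
        using namespace arm_compute::cl_matmul;
        auto variant = ClMatMulNativeKernelVariantFactory::create(arm_compute::GPUTarget::VALHALL);
        return variant->select_kernel(lhs, rhs, info, act_info);
    }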
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..00ba3641d5
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Basic container for the OpenCL MatMul Native configuration functions */
+template <class T>
+class ClMatMulNativeConfigArray
+{
+public:
+ /** Alias for F32 index */
+ static constexpr size_t DT_F32 = 0;
+ /** Alias for F16 index */
+ static constexpr size_t DT_F16 = 1;
+ /** Alias for Int8 index */
+ static constexpr size_t DT_INT8 = 2;
+
+ /** Constructor
+ *
+ * @param[in] func_f32 Function to call for matmul native F32
+ * @param[in] func_f16 Function to call for matmul native F16
+ * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+ *
+ */
+ ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+ {
+ }
+
+ /** Method to return the matmul native configuration function based on data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return the configuration function for the given data type, or nullptr if the data type is not supported
+ */
+ T get_function(DataType data_type)
+ {
+ switch (data_type)
+ {
+ case DataType::F32:
+ return _configs.at(DT_F32);
+ case DataType::F16:
+ return _configs.at(DT_F16);
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ return _configs.at(DT_INT8);
+ default:
+ return nullptr;
+ }
+ }
+
+private:
+ std::array<T, 3> _configs;
+};
+
+/** Basic interface for the matmul native kernel configuration
+ * This is the base class that chooses architecture-specific kernel configurations.
+ */
+class IClMatMulNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target
+ */
+ IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig);
+ /** Virtual destructor */
+ virtual ~IClMatMulNativeKernelConfig() = default;
+ /** This method returns the @ref MatMulKernelInfo for the given inputs
+ *
+ * @param[in] lhs LHS tensor
+ * @param[in] rhs RHS tensor
+ * @param[in] info MatMul info
+ */
+ virtual MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) = 0;
+
+protected:
+ GPUTarget _target;
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
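For illustration only, a derived configuration class would typically register one heuristic per data-type group in a ClMatMulNativeConfigArray and dispatch through get_function; the three per-type functions below are hypothetical stubs, not the Valhall heuristics added elsewhere in this patch (MatMulKernelInfo is assumed to come from arm_compute/core/KernelDescriptors.h):

    using ConfigFn = arm_compute::MatMulKernelInfo (*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);

    // Hypothetical per-type heuristics (stubbed out for the sketch).
    arm_compute::MatMulKernelInfo configure_f32(unsigned int, unsigned int, unsigned int, unsigned int)  { return {}; }
    arm_compute::MatMulKernelInfo configure_f16(unsigned int, unsigned int, unsigned int, unsigned int)  { return {}; }
    arm_compute::MatMulKernelInfo configure_int8(unsigned int, unsigned int, unsigned int, unsigned int) { return {}; }

    arm_compute::MatMulKernelInfo dispatch(arm_compute::DataType dt,
                                           unsigned int m, unsigned int n, unsigned int k, unsigned int b)
    {
        arm_compute::cl_matmul::ClMatMulNativeConfigArray<ConfigFn> table(&configure_f32, &configure_f16,
                                                                          &configure_int8);
        ConfigFn fn = table.get_function(dt); // nullptr if dt is not one of the supported data types
        return (fn != nullptr) ? fn(m, n, k, b) : arm_compute::MatMulKernelInfo{};
    }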
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..eac41dd6a3
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
+
+#include "arm_compute/core/CoreTypes.h" // DataType
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+enum class MatMulKernelType
+{
+ /** Native matrix multiplication for FP types */
+ NATIVE_FP,
+
+ /** Native matrix multiplication for quantized types */
+ NATIVE_QUANTIZED,
+
+ /** Native matrix multiplication using MMUL extension for FP types */
+ NATIVE_MMUL_FP,
+
+ /** Native matrix multiplication using MMUL extension for Quantized types */
+ NATIVE_MMUL_QUANTIZED
+};
+
+/** Basic container for the OpenCL MatMul Native variant functions */
+template <class T>
+class ClMatMulNativeVariantArray
+{
+public:
+ /** Alias for Float index */
+ static constexpr size_t DT_FLOAT = 0;
+ /** Alias for Quantized type index */
+ static constexpr size_t DT_QUANTIZED = 1;
+
+ /** Constructor
+ *
+ * @param[in] func_float Function to call for matmul native float (F32, F16)
+ * @param[in] func_quantized Function to call for matmul native quantized (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+ *
+ */
+ ClMatMulNativeVariantArray(T func_float, T func_quantized) : _configs{func_float, func_quantized}
+ {
+ }
+
+ /** Method to return the matmul native variant function based on data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return the variant function for the given data type, or nullptr if the data type is not supported
+ */
+ T get_function(DataType data_type)
+ {
+ switch (data_type)
+ {
+ case DataType::F32:
+ case DataType::F16:
+ return _configs.at(DT_FLOAT);
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ return _configs.at(DT_QUANTIZED);
+ default:
+ return nullptr;
+ }
+ }
+
+private:
+ std::array<T, 2> _configs;
+};
+
+/** Basic interface for the matmul native kernel variant
+ * This is the base class that chooses architecture-specific kernel variants.
+ */
+class IClMatMulNativeKernelVariant
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target
+ */
+ IClMatMulNativeKernelVariant(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelVariant);
+ /** Virtual destructor */
+ virtual ~IClMatMulNativeKernelVariant() = default;
+ /** This method returns the @ref MatMulKernelType for the given inputs
+ *
+ * @param[in] lhs LHS tensor
+ * @param[in] rhs RHS tensor
+ * @param[in] info MatMul info
+ * @param[in] act_info Activation layer info
+ */
+ virtual MatMulKernelType select_kernel(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulInfo &info,
+ const ActivationLayerInfo &act_info) = 0;
+
+protected:
+ GPUTarget _target;
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
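Finally, to show the shape of a concrete heuristic built on this interface, here is a deliberately naive sketch (it is not the ClMatMulNativeDefaultVariantValhall implementation added by this patch): it ignores the MMUL extension and the fused activation and only separates floating-point from quantized inputs.

    namespace arm_compute
    {
    namespace cl_matmul
    {
    class NaiveMatMulVariant final : public IClMatMulNativeKernelVariant
    {
    public:
        using IClMatMulNativeKernelVariant::IClMatMulNativeKernelVariant; // inherit the GPUTarget constructor

        MatMulKernelType select_kernel(const ITensorInfo         *lhs,
                                       const ITensorInfo         *rhs,
                                       const MatMulInfo          &info,
                                       const ActivationLayerInfo &act_info) override
        {
            ARM_COMPUTE_UNUSED(rhs, info, act_info);
            // Route quantized inputs to the quantized kernel, everything else to the FP kernel.
            switch (lhs->data_type())
            {
                case DataType::QASYMM8:
                case DataType::QASYMM8_SIGNED:
                case DataType::QSYMM8_PER_CHANNEL:
                    return MatMulKernelType::NATIVE_QUANTIZED;
                default:
                    return MatMulKernelType::NATIVE_FP;
            }
        }
    };
    } // namespace cl_matmul
    } // namespace arm_compute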