Diffstat (limited to 'src/runtime/heuristics')
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp        67
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h          20
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp       125
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h          20
-rw-r--r--  src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h                   4
-rw-r--r--  src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h                 14
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp         133
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h            42
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp         127
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h            35
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp              8
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h                     2
-rw-r--r--  src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h                   16
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp    60
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h       9
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h               2
-rw-r--r--  src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h             12
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp   396
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h      11
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp                 42
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h                   15
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h               4
-rw-r--r--  src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h              9
23 files changed, 598 insertions(+), 575 deletions(-)
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
index 1bfb8124e9..aba32871d0 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include <utility>
namespace arm_compute
@@ -37,25 +38,27 @@ namespace cl_direct_conv
{
using namespace arm_compute::misc::shape_calculator;
-ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu)
- : IClDirectConvKernelConfig(gpu)
+ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
{
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
- using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info);
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDirectConvDefaultConfigBifrost::configure_G71_f32,
- &ClDirectConvDefaultConfigBifrost::configure_G71_f16,
- &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16,
+ &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
- ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClDirectConvDefaultConfigBifrost::configure_default_f32,
- &ClDirectConvDefaultConfigBifrost::configure_default_f16,
- &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+ &ClDirectConvDefaultConfigBifrost::configure_default_f32,
+ &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G71:
func = configs_G71.get_function(src->data_type());
@@ -69,18 +72,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const IT
return (this->*func)(src, wei, conv_info);
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 2;
}
@@ -93,18 +98,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 4;
}
@@ -117,18 +124,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 4;
}
@@ -141,18 +150,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(c
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 2;
}
@@ -165,18 +176,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 4;
}
@@ -188,5 +201,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_
return desc;
}
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
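The reflowed configure() above illustrates the dispatch pattern these heuristic classes share: a
pointer-to-member-function alias names every per-data-type entry, a small array maps the data type
to an entry, and the call goes through this. A minimal, self-contained sketch of that pattern
follows; the names are hypothetical stand-ins, not ACL code.

#include <iostream>

struct Heuristic
{
    // Stand-in for ClDirectConvDefaultConfigBifrost::configure(): select a
    // per-type member function from a table and dispatch through *this*.
    int configure(int data_type_index)
    {
        using Fn = int (Heuristic::*)() const; // pointer-to-member-function type
        const Fn table[2] = {&Heuristic::configure_f32, &Heuristic::configure_f16};
        return (this->*table[data_type_index])();
    }
    int configure_f32() const { return 32; }
    int configure_f16() const { return 16; }
};

int main()
{
    Heuristic h;
    std::cout << h.configure(1) << '\n'; // prints 16
}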
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
index 6b60b2c007..ed6a4c3c68 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
@@ -41,15 +41,21 @@ public:
ClDirectConvDefaultConfigBifrost(GPUTarget gpu);
// Inherited overridden method
- DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
private:
- DirectConvComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
};
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
index 8f2fd82412..4b7666d5aa 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include <utility>
namespace arm_compute
@@ -37,25 +38,27 @@ namespace cl_direct_conv
{
using namespace arm_compute::misc::shape_calculator;
-ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu)
- : IClDirectConvKernelConfig(gpu)
+ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
{
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
- using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info);
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDirectConvDefaultConfigValhall::configure_G78_f32,
- &ClDirectConvDefaultConfigValhall::configure_G78_f16,
- &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16,
+ &ClDirectConvDefaultConfigValhall::configure_G78_u8);
- ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57(&ClDirectConvDefaultConfigValhall::configure_G57_f32,
- &ClDirectConvDefaultConfigValhall::configure_G57_f16,
- &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57(
+ &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16,
+ &ClDirectConvDefaultConfigValhall::configure_G78_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G57:
func = configs_G57.get_function(src->data_type());
@@ -70,15 +73,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const IT
return (this->*func)(src, wei, conv_info);
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
const int32_t ofm = dst_shape[0];
@@ -87,11 +92,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(dst_shape[0] <= 4)
+ if (dst_shape[0] <= 4)
{
- if(is_pointwise)
+ if (is_pointwise)
{
- if(ofm == 4)
+ if (ofm == 4)
{
desc.m0 = 1;
desc.n0 = 4;
@@ -113,7 +118,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(
}
else
{
- if(m < 64)
+ if (m < 64)
{
desc.m0 = 1;
desc.n0 = 1;
@@ -131,15 +136,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
const int32_t ofm = dst_shape[0];
@@ -149,15 +156,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(dst_shape[0] <= 4)
+ if (dst_shape[0] <= 4)
{
// k0 should be as large as possible. However, we should avoid
// having left-over for loops that make the implementation slower.
- if((k % 16) == 0)
+ if ((k % 16) == 0)
{
desc.k0 = 16;
}
- else if((k % 8) == 0)
+ else if ((k % 8) == 0)
{
desc.k0 = 8;
}
@@ -166,9 +173,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
desc.k0 = 4;
}
- if(is_pointwise)
+ if (is_pointwise)
{
- if(ofm == 4)
+ if (ofm == 4)
{
desc.m0 = 1;
desc.n0 = 4;
@@ -187,15 +194,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
}
else
{
- if(m < 64)
+ if (m < 64)
{
desc.m0 = 1;
desc.n0 = 1;
- if((k % 16) == 0)
+ if ((k % 16) == 0)
{
desc.k0 = 16;
}
- else if((k % 8) == 0)
+ else if ((k % 8) == 0)
{
desc.k0 = 8;
}
@@ -206,9 +213,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
}
else
{
- if(ofm >= 16)
+ if (ofm >= 16)
{
- if(m / 6 > 24000)
+ if (m / 6 > 24000)
{
desc.m0 = 6;
}
@@ -223,11 +230,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
{
desc.m0 = 2;
desc.n0 = 8;
- if((k % 16) == 0)
+ if ((k % 16) == 0)
{
desc.k0 = 16;
}
- else if((k % 8) == 0)
+ else if ((k % 8) == 0)
{
desc.k0 = 8;
}
@@ -243,18 +250,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 4;
}
@@ -267,15 +276,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(c
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
const int32_t m = dst_shape[1] * dst_shape[2];
@@ -283,9 +294,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(dst_shape[0] <= 4)
+ if (dst_shape[0] <= 4)
{
- if(is_pointwise)
+ if (is_pointwise)
{
desc.m0 = 1;
desc.n0 = 1;
@@ -300,9 +311,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(
}
else
{
- if(m < 64)
+ if (m < 64)
{
- if(m == 1)
+ if (m == 1)
{
desc.m0 = 1;
desc.n0 = 1;
@@ -327,15 +338,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
const int32_t ofm = dst_shape[0];
@@ -344,9 +357,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(dst_shape[0] <= 4)
+ if (dst_shape[0] <= 4)
{
- if(is_pointwise)
+ if (is_pointwise)
{
desc.m0 = 2;
desc.n0 = 1;
@@ -361,9 +374,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
}
else
{
- if(m < 64)
+ if (m < 64)
{
- if(m == 1)
+ if (m == 1)
{
desc.m0 = 1;
desc.n0 = 1;
@@ -378,7 +391,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
}
else
{
- if(ofm > 16)
+ if (ofm > 16)
{
desc.m0 = 4;
desc.n0 = 8;
@@ -396,5 +409,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
return desc;
}
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
index f9d5c5299e..efd879a567 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
@@ -41,15 +41,21 @@ public:
ClDirectConvDefaultConfigValhall(GPUTarget gpu);
// Inherited overridden method
- DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
private:
- DirectConvComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
};
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
index 232167fc59..2c2509f70b 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
@@ -46,7 +46,7 @@ public:
*/
static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71);
@@ -59,6 +59,6 @@ public:
}
}
};
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG */
diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
index 6104d73594..e5b270c720 100644
--- a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
+++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
namespace arm_compute
@@ -52,8 +53,7 @@ public:
* @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
*
*/
- ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8)
- : _configs{ func_f32, func_f16, func_int8 }
+ ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
{
}
@@ -65,7 +65,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -92,8 +92,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClDirectConvKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClDirectConvKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig);
@@ -105,11 +104,12 @@ public:
* @param[in] wei Weights tensor
* @param[in] conv_info Convolution info
*/
- virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+ virtual DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
protected:
GPUTarget _target;
};
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
index 5311fdcec3..98ebf3ebbe 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
-#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/GPUTarget.h"
@@ -30,28 +29,34 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
namespace arm_compute
{
namespace cl_dwc
{
namespace
{
-DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier, bool is_g71)
+DWCComputeKernelInfo configure_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ bool is_g71)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape wei_shape = wei->tensor_shape();
const size_t kernel_c = wei_shape[idx_c];
const size_t kernel_w = wei_shape[idx_w];
desc.export_input_to_cl_image = false;
- if(is_g71)
+ if (is_g71)
{
desc.export_weights_to_cl_image = false;
}
@@ -60,17 +65,17 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
}
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
desc.n0 = 4;
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -81,14 +86,15 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
@@ -106,16 +112,20 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we
return desc;
}
-DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier, bool is_g71)
+DWCComputeKernelInfo configure_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ bool is_g71)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Src and weights have the same dimension indices
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape src_shape = src->tensor_shape();
const TensorShape wei_shape = wei->tensor_shape();
const size_t src_w = src_shape[idx_w];
@@ -124,7 +134,7 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
desc.export_input_to_cl_image = false;
- if(is_g71)
+ if (is_g71)
{
desc.export_weights_to_cl_image = false;
}
@@ -133,9 +143,9 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
}
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- if(desc.export_weights_to_cl_image == false)
+ if (desc.export_weights_to_cl_image == false)
{
desc.n0 = 8;
}
@@ -146,11 +156,11 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -161,20 +171,21 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
else
{
- if((src_w % 5) == 0)
+ if ((src_w % 5) == 0)
{
desc.m0 = 5;
}
@@ -194,27 +205,30 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
}
} // namespace
-ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)
- : IClDWCNativeKernelConfig(gpu)
+ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu)
{
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
- using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
+ using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
- ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32,
- &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
- &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
- ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32,
- &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
- &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G71:
func = configs_G71.get_function(src->data_type());
@@ -228,43 +242,58 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInf
return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
ARM_COMPUTE_UNUSED(wei);
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = false;
desc.n0 = (depth_multiplier == 1) ? 4 : 1;
- if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
{
desc.m0 = 2;
}
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
index cec2cae5dd..41d86c9c14 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
@@ -41,20 +41,38 @@ public:
ClDWCNativeDefaultConfigBifrost(GPUTarget gpu);
// Inherited overridden method
- DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier) override;
+ DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) override;
private:
- DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
};
} // namespace cl_dwc
} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
index 51f3787875..ef1bb3858c 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
-#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/GPUTarget.h"
@@ -30,31 +29,36 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
namespace arm_compute
{
namespace cl_dwc
{
-ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu)
- : IClDWCNativeKernelConfig(gpu)
+ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu)
{
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
- using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
+ using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
- ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
- &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
- &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
- ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
- &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
- &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G77:
func = configs_G77.get_function(src->data_type());
@@ -69,15 +73,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInf
return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape wei_shape = wei->tensor_shape();
const size_t kernel_c = wei_shape[idx_c];
const size_t kernel_w = wei_shape[idx_w];
@@ -85,17 +92,17 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
desc.n0 = 4;
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -106,14 +113,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
@@ -131,16 +139,19 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT
return desc;
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Src and weights have the same dimension indices
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape src_shape = src->tensor_shape();
const TensorShape wei_shape = wei->tensor_shape();
const size_t src_w = src_shape[idx_w];
@@ -150,9 +161,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- if(desc.export_weights_to_cl_image == false)
+ if (desc.export_weights_to_cl_image == false)
{
desc.n0 = 8;
}
@@ -163,11 +174,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -178,20 +189,21 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
else
{
- if((src_w % 5) == 0)
+ if ((src_w % 5) == 0)
{
desc.m0 = 5;
}
@@ -210,19 +222,22 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT
return desc;
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
ARM_COMPUTE_UNUSED(wei);
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = false;
desc.n0 = (depth_multiplier == 1) ? 4 : 1;
- if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
{
desc.m0 = 2;
}
@@ -235,15 +250,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITe
return desc;
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape wei_shape = wei->tensor_shape();
const size_t kernel_c = wei_shape[idx_c];
const size_t kernel_w = wei_shape[idx_w];
@@ -251,9 +269,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- if(desc.export_weights_to_cl_image == false)
+ if (desc.export_weights_to_cl_image == false)
{
desc.n0 = 8;
}
@@ -264,11 +282,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -279,14 +297,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
index 4d51fa668c..fabce77b54 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
@@ -41,18 +41,33 @@ public:
ClDWCNativeDefaultConfigValhall(GPUTarget gpu);
// Inherited overridden method
- DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier) override;
+ DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) override;
private:
- DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
};
} // namespace cl_dwc
} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
index 5593c6de61..c8b006c546 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
@@ -32,7 +32,7 @@ namespace cl_dwc
bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier)
{
// Check whether we can use the cl image with the weights.
- if(!export_to_cl_image(weights))
+ if (!export_to_cl_image(weights))
{
return false;
}
@@ -45,12 +45,12 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul
// If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons:
// 1- When the kernel size is 1x1
// 2- When the depth multiplier is greater than 1 and not a multiple of 4.
- if((kernel_w == 1) && (kernel_h == 1))
+ if ((kernel_w == 1) && (kernel_h == 1))
{
return false;
}
- if((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+ if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
{
return false;
}
@@ -58,4 +58,4 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul
return true;
}
} // namespace cl_dwc
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
index c08053dcb3..49ce6ff479 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -46,7 +46,7 @@ public:
*/
static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
// The heuristic for Midgard is the same as the one used for Arm Mali-G71
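
For context, callers obtain the per-architecture heuristic through this factory rather than instantiating one directly. A hedged usage sketch (the tensor and convolution objects are placeholders, and the GPU target, dilation, and depth multiplier below are assumed values for illustration):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"

using namespace arm_compute;

DWCComputeKernelInfo pick_dwc_config(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
{
    // create() dispatches to the Bifrost or Valhall heuristic based on the GPU target.
    auto heuristic = cl_dwc::ClDWCNativeKernelConfig::create(GPUTarget::G78);

    // Size2D(1, 1) dilation and depth_multiplier = 1 are placeholder values.
    return heuristic->configure(src, wei, conv_info, Size2D(1, 1), 1U);
}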
diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
index b5df132a12..614a6622df 100644
--- a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
+++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
namespace arm_compute
@@ -52,8 +53,7 @@ public:
* @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
*
*/
- ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8)
- : _configs{ func_f32, func_f16, func_int8 }
+ ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
{
}
@@ -65,7 +65,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -92,8 +92,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClDWCNativeKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig);
@@ -107,8 +106,11 @@ public:
* @param[in] dilation Kernel dilation
* @param[in] depth_multiplier Output feature maps multiplier
*/
- virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier) = 0;
+ virtual DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) = 0;
protected:
GPUTarget _target;
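
The config array and the interface combine into the dispatch pattern every concrete heuristic uses: one member-function pointer per data-type family, selected by get_function(). A simplified sketch assembled from the declarations earlier in this patch (the real Valhall implementation also selects between per-GPU tables, which this sketch omits):

DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src,
                                                                const ITensorInfo *wei,
                                                                const PadStrideInfo &conv_info,
                                                                const Size2D &dilation,
                                                                unsigned int depth_multiplier)
{
    using Fn = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(
        const ITensorInfo *, const ITensorInfo *, const PadStrideInfo &, const Size2D &, unsigned int);

    // One entry per data-type family: F32, F16, Int8 (QASYMM8 and friends).
    ClDWCNativeConfigArray<Fn> configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
                                           &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
                                           &ClDWCNativeDefaultConfigValhall::configure_G78_u8);

    // get_function() switches on the data type and returns the matching entry.
    Fn func = configs_G78.get_function(src->data_type());
    return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
}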
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
index 990f050112..3380d8f1b7 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
@@ -35,17 +35,19 @@ namespace cl_indirect_conv
{
using namespace arm_compute::misc::shape_calculator;
-ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu)
- : IClIndirectConvKernelConfig(gpu)
+ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu)
{
}
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
- using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClIndirectConvDefaultConfigValhall::configure_G77_f32,
- &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
+ ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
// Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise), because the indirect buffer makes
// indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned
@@ -57,22 +59,24 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const
return (this->*func)(src, wei, conv_info);
}
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
- const int32_t stride_x = conv_info.stride().first;
- const int32_t stride_y = conv_info.stride().second;
- const int32_t ofm = dst_shape[0];
- const int32_t m = (dst_shape[1]/ stride_x) * (dst_shape[2] / stride_y);
+ const int32_t stride_x = conv_info.stride().first;
+ const int32_t stride_y = conv_info.stride().second;
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y);
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(ofm <= 4)
+ if (ofm <= 4)
{
desc.m0 = 1;
desc.n0 = 2;
@@ -82,7 +86,7 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3
{
// The 16000 threshold value has been identified as the right
// one for using the biggest block size allowed on F32: 5x4x4
- if(m < 16000)
+ if (m < 16000)
{
desc.m0 = 4;
desc.n0 = 4;
@@ -100,31 +104,33 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3
return desc;
}
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
- const int32_t ofm = dst_shape[0];
- const int32_t m = dst_shape[1] * dst_shape[2];
- const int32_t k = wei_shape[0];
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = dst_shape[1] * dst_shape[2];
+ const int32_t k = wei_shape[0];
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(ofm <= 4)
+ if (ofm <= 4)
{
            // k0 should be as large as possible. However, we should avoid
            // having left-over for-loops that make the implementation slower.
- if((k % 16) == 0)
+ if ((k % 16) == 0)
{
desc.k0 = 16;
}
- else if((k % 8) == 0)
+ else if ((k % 8) == 0)
{
desc.k0 = 8;
}
@@ -140,11 +146,11 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f1
{
// The 16000 threshold value has been identified as the right
// one for using the biggest block size allowed on F16: 8x4
- if(m >= 16000 && k < 4)
+ if (m >= 16000 && k < 4)
{
desc.m0 = 8;
desc.n0 = 4;
- desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4
+ desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4
}
else
{
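
Two heuristics in this hunk deserve a note: k0 is chosen as the largest vector width that divides k, so the inner loop has no leftover iterations, and the 16000-element threshold on m gates the biggest block sizes. A condensed sketch of the k0 selection (the final fallback value is an assumption, since the tail of that branch falls outside the hunk):

#include <cstdint>

// Largest of {16, 8} that divides k, mirroring configure_G77_f16 above;
// the trailing fallback (4) is assumed for illustration only.
int32_t choose_k0(int32_t k)
{
    if ((k % 16) == 0)
    {
        return 16;
    }
    if ((k % 8) == 0)
    {
        return 8;
    }
    return 4; // assumed fallback; the remaining branches are not shown in the hunk
}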
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
index 68dca91885..bab808c66c 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
@@ -41,11 +41,14 @@ public:
ClIndirectConvDefaultConfigValhall(GPUTarget gpu);
// Inherited overridden method
- DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
private:
- DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
};
} // namespace cl_indirect_conv
} // namespace arm_compute
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
index 73fbb87560..dd614e1f68 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
@@ -45,7 +45,7 @@ public:
*/
static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
index d2f4cde662..d05da18b58 100644
--- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
+++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
namespace arm_compute
@@ -49,8 +50,7 @@ public:
* @param[in] func_f16 Function to call for indirect convolution F16
*
*/
- ClIndirectConvConfigArray(T func_f32, T func_f16)
- : _configs{ func_f32, func_f16}
+ ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16}
{
}
@@ -62,7 +62,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -85,8 +85,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClIndirectConvKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig);
@@ -98,7 +97,8 @@ public:
* @param[in] wei Weights tensor
* @param[in] conv_info Convolution info
*/
- virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+ virtual DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
protected:
GPUTarget _target;
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
index 01102b3d60..b3c8d891dc 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
@@ -28,30 +28,33 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
-#include <utility>
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+#include <utility>
+
namespace arm_compute
{
namespace cl_matmul
{
-ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu)
- : IClMatMulNativeKernelConfig(gpu)
+ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu)
{
}
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info)
+MatMulKernelInfo
+ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info)
{
- using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo & info);
+ using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
- ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(&ClMatMulNativeDefaultConfigValhall::configure_G710_f32,
- &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
- &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
+ ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_f32,
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G710:
default:
@@ -67,7 +70,7 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo
const bool is_batched = lhs_shape.num_dimensions() > 2;
- if(is_batched == true)
+ if (is_batched == true)
{
lhs_shape.collapse_from(2);
}
@@ -81,103 +84,48 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo
return (this->*func)(m, n, k, b, rhs->lock_paddings(), info);
}
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
{
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 4, 1 },
- { 688, 92, 68, 32, 2, 8, 4, 1 },
- { 24, 464, 412, 24, 2, 8, 4, 1 },
- { 112, 184, 144, 28, 4, 4, 16, 1 },
- { 5776, 64, 32, 36, 2, 4, 16, 1 },
- { 1568, 64, 40, 36, 2, 8, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 16, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 8, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 0 },
- { 688, 92, 68, 32, 5, 4, 4, 0 },
- { 24, 464, 412, 24, 6, 2, 8, 0 },
- { 112, 184, 144, 28, 6, 4, 4, 0 },
- { 5776, 64, 32, 36, 5, 4, 4, 0 },
- { 1568, 64, 40, 36, 4, 4, 8, 0 },
- { 2920, 64, 64, 24, 4, 4, 8, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 1 },
- { 4096, 48, 32, 36, 2, 2, 16, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 6, 2, 8, 1 },
- { 112, 184, 144, 28, 4, 2, 16, 1 },
- { 5776, 64, 32, 36, 4, 4, 4, 1 },
- { 1568, 64, 40, 36, 4, 4, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 4, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t =
- {
- { 3136, 64, 64, 36, 5, 4, 4, 0 },
- { 4096, 48, 32, 36, 5, 4, 4, 0 },
- { 688, 92, 68, 32, 5, 4, 4, 0 },
- { 24, 464, 412, 24, 6, 2, 4, 0 },
- { 112, 184, 144, 28, 5, 4, 4, 0 },
- { 5776, 64, 32, 36, 5, 4, 4, 0 },
- { 1568, 64, 40, 36, 5, 4, 4, 0 },
- { 2920, 64, 64, 24, 6, 2, 4, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 4, 1 },
- { 688, 92, 68, 32, 2, 8, 4, 1 },
- { 24, 464, 412, 24, 2, 8, 4, 1 },
- { 112, 184, 144, 28, 4, 4, 16, 1 },
- { 5776, 64, 32, 36, 2, 8, 8, 1 },
- { 1568, 64, 40, 36, 4, 4, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 16, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 0 },
- { 4096, 48, 32, 36, 4, 4, 4, 0 },
- { 688, 92, 68, 32, 4, 4, 4, 0 },
- { 24, 464, 412, 24, 4, 4, 4, 0 },
- { 112, 184, 144, 28, 4, 4, 4, 0 },
- { 5776, 64, 32, 36, 4, 4, 8, 0 },
- { 1568, 64, 40, 36, 4, 4, 4, 0 },
- { 2920, 64, 64, 24, 4, 4, 4, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 1 },
- { 4096, 48, 32, 36, 4, 4, 4, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 2, 2, 16, 1 },
- { 112, 184, 144, 28, 4, 4, 4, 1 },
- { 5776, 64, 32, 36, 4, 4, 4, 1 },
- { 1568, 64, 40, 36, 4, 4, 4, 1 },
- { 2920, 64, 64, 24, 4, 4, 4, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 0 },
- { 4096, 48, 32, 36, 4, 4, 4, 0 },
- { 688, 92, 68, 32, 4, 4, 4, 0 },
- { 24, 464, 412, 24, 4, 2, 8, 0 },
- { 112, 184, 144, 28, 4, 4, 4, 0 },
- { 5776, 64, 32, 36, 4, 4, 4, 0 },
- { 1568, 64, 40, 36, 4, 4, 4, 0 },
- { 2920, 64, 64, 24, 4, 4, 4, 0 }
- };
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1},
+ {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1},
+ {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0},
+ {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+ {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0},
+ {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+ {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1},
+ {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
const bool adj_lhs = info.adj_lhs();
const bool adj_rhs = info.adj_rhs();
@@ -185,17 +133,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned
const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr;
const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr;
- if((adj_lhs == false) && (adj_rhs == false))
+ if ((adj_lhs == false) && (adj_rhs == false))
{
configs_best_to_use = &configs_mnkb_best_nt_nt;
configs_fallback_to_use = &configs_mnkb_fallback_nt_nt;
}
- else if((adj_lhs == false) && (adj_rhs == true))
+ else if ((adj_lhs == false) && (adj_rhs == true))
{
configs_best_to_use = &configs_mnkb_best_nt_t;
configs_fallback_to_use = &configs_mnkb_fallback_nt_t;
}
- else if((adj_lhs == true) && (adj_rhs == false))
+ else if ((adj_lhs == true) && (adj_rhs == false))
{
configs_best_to_use = &configs_mnkb_best_t_nt;
configs_fallback_to_use = &configs_mnkb_fallback_t_nt;
@@ -209,108 +157,51 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned
MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b);
MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b);
- return select_info(desc0,
- desc1,
- m, n, k, b, DataType::F32, rhs_lock_padding);
+ return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding);
}
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
{
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 8, 1 },
- { 688, 92, 68, 32, 4, 4, 16, 1 },
- { 24, 464, 412, 24, 4, 4, 4, 1 },
- { 112, 184, 144, 28, 4, 4, 16, 1 },
- { 5776, 64, 32, 36, 4, 4, 8, 1 },
- { 1568, 64, 40, 36, 4, 4, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 16, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt =
- {
- { 3136, 64, 64, 36, 6, 4, 8, 0 },
- { 4096, 48, 32, 36, 6, 4, 8, 0 },
- { 688, 92, 68, 32, 6, 4, 8, 0 },
- { 24, 464, 412, 24, 4, 4, 8, 0 },
- { 112, 184, 144, 28, 6, 4, 8, 0 },
- { 5776, 64, 32, 36, 6, 4, 8, 0 },
- { 1568, 64, 40, 36, 6, 4, 8, 0 },
- { 2920, 64, 64, 24, 6, 4, 8, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
- {
- { 3136, 64, 64, 36, 6, 4, 8, 1 },
- { 4096, 48, 32, 36, 6, 4, 8, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 6, 2, 4, 1 },
- { 112, 184, 144, 28, 4, 2, 16, 1 },
- { 5776, 64, 32, 36, 6, 4, 8, 1 },
- { 1568, 64, 40, 36, 6, 4, 8, 1 },
- { 2920, 64, 64, 24, 6, 4, 8, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t =
- {
- { 3136, 64, 64, 36, 6, 2, 16, 0 },
- { 4096, 48, 32, 36, 5, 4, 8, 0 },
- { 688, 92, 68, 32, 6, 2, 16, 0 },
- { 24, 464, 412, 24, 6, 2, 16, 0 },
- { 112, 184, 144, 28, 6, 2, 16, 0 },
- { 5776, 64, 32, 36, 5, 4, 8, 0 },
- { 1568, 64, 40, 36, 5, 4, 8, 0 },
- { 2920, 64, 64, 24, 6, 2, 16, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 4, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 4, 4, 4, 1 },
- { 112, 184, 144, 28, 4, 4, 4, 1 },
- { 5776, 64, 32, 36, 4, 4, 4, 1 },
- { 1568, 64, 40, 36, 4, 4, 4, 1 },
- { 2920, 64, 64, 24, 4, 4, 4, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 0 },
- { 4096, 48, 32, 36, 4, 4, 4, 0 },
- { 688, 92, 68, 32, 4, 4, 4, 0 },
- { 24, 464, 412, 24, 4, 4, 4, 0 },
- { 112, 184, 144, 28, 4, 4, 4, 0 },
- { 5776, 64, 32, 36, 4, 4, 4, 0 },
- { 1568, 64, 40, 36, 4, 4, 4, 0 },
- { 2920, 64, 64, 24, 4, 4, 4, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 8, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 4, 2, 8, 1 },
- { 112, 184, 144, 28, 4, 2, 16, 1 },
- { 5776, 64, 32, 36, 4, 4, 16, 1 },
- { 1568, 64, 40, 36, 4, 4, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 16, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t =
- {
- { 3136, 64, 64, 36, 4, 4, 8, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 0 },
- { 688, 92, 68, 32, 4, 4, 8, 0 },
- { 24, 464, 412, 24, 4, 4, 8, 0 },
- { 112, 184, 144, 28, 4, 4, 8, 0 },
- { 5776, 64, 32, 36, 4, 4, 8, 0 },
- { 1568, 64, 40, 36, 4, 4, 8, 0 },
- { 2920, 64, 64, 24, 4, 4, 8, 0 }
- };
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1},
+ {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+ {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0},
+ {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0},
+ {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1},
+ {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+ {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0},
+ {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0},
+ {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0},
+ {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
const bool adj_lhs = info.adj_lhs();
const bool adj_rhs = info.adj_rhs();
@@ -318,17 +209,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned
const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr;
const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr;
- if((adj_lhs == false) && (adj_rhs == false))
+ if ((adj_lhs == false) && (adj_rhs == false))
{
configs_best_to_use = &configs_mnkb_best_nt_nt;
configs_fallback_to_use = &configs_mnkb_fallback_nt_nt;
}
- else if((adj_lhs == false) && (adj_rhs == true))
+ else if ((adj_lhs == false) && (adj_rhs == true))
{
configs_best_to_use = &configs_mnkb_best_nt_t;
configs_fallback_to_use = &configs_mnkb_fallback_nt_t;
}
- else if((adj_lhs == true) && (adj_rhs == false))
+ else if ((adj_lhs == true) && (adj_rhs == false))
{
configs_best_to_use = &configs_mnkb_best_t_nt;
configs_fallback_to_use = &configs_mnkb_fallback_t_nt;
@@ -342,75 +233,46 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned
MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b);
MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b);
- return select_info(desc0,
- desc1,
- m, n, k, b, DataType::F16, rhs_lock_padding);
+ return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding);
}
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
{
ARM_COMPUTE_UNUSED(rhs_lock_padding);
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
- {
- { 3136, 64, 64, 36, 6, 4, 4, 0 },
- { 4096, 48, 32, 36, 6, 4, 4, 0 },
- { 688, 92, 68, 32, 2, 8, 4, 0 },
- { 24, 464, 412, 24, 4, 4, 4, 0 },
- { 112, 184, 144, 28, 6, 4, 4, 0 },
- { 5776, 64, 32, 36, 6, 4, 4, 0 },
- { 1568, 64, 40, 36, 6, 4, 4, 0 },
- { 2920, 64, 64, 24, 5, 4, 4, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 0 },
- { 4096, 48, 32, 36, 4, 4, 16, 0 },
- { 688, 92, 68, 32, 4, 4, 16, 0 },
- { 24, 464, 412, 24, 6, 2, 16, 0 },
- { 112, 184, 144, 28, 4, 4, 16, 0 },
- { 5776, 64, 32, 36, 4, 4, 16, 0 },
- { 1568, 64, 40, 36, 6, 4, 4, 0 },
- { 2920, 64, 64, 24, 4, 4, 16, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 8, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 0 },
- { 688, 92, 68, 32, 4, 4, 4, 0 },
- { 24, 464, 412, 24, 4, 4, 4, 0 },
- { 112, 184, 144, 28, 4, 4, 8, 0 },
- { 5776, 64, 32, 36, 4, 4, 8, 0 },
- { 1568, 64, 40, 36, 4, 4, 8, 0 },
- { 2920, 64, 64, 24, 4, 4, 8, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
- {
- { 3136, 64, 64, 36, 4, 2, 16, 0 },
- { 4096, 48, 32, 36, 4, 4, 4, 0 },
- { 688, 92, 68, 32, 4, 4, 8, 0 },
- { 24, 464, 412, 24, 4, 2, 16, 0 },
- { 112, 184, 144, 28, 4, 2, 16, 0 },
- { 5776, 64, 32, 36, 4, 4, 4, 0 },
- { 1568, 64, 40, 36, 4, 4, 8, 0 },
- { 2920, 64, 64, 24, 4, 2, 16, 0 }
- };
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0},
+ {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0},
+ {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0},
+ {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0},
+ {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}};
const bool adj_lhs = info.adj_lhs();
const bool adj_rhs = info.adj_rhs();
- if((adj_lhs == false) && (adj_rhs == false))
+ if ((adj_lhs == false) && (adj_rhs == false))
{
return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b);
}
- else if((adj_lhs == false) && (adj_rhs == true))
+ else if ((adj_lhs == false) && (adj_rhs == true))
{
return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b);
}
- else if((adj_lhs == true) && (adj_rhs == false))
+ else if ((adj_lhs == true) && (adj_rhs == false))
{
return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b);
}
@@ -419,5 +281,5 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned
return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b);
}
}
-} // namespace opencl
+} // namespace cl_matmul
} // namespace arm_compute
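
The tables above are easier to read once the column layout is spelled out: each row pairs a reference workload with the block sizes tuned for it, in the order M, N, K, B, M0, N0, K0, IMG_RHS (the order given by the error message in ClMatMulNativeHelpers.cpp). A hypothetical decoder, not part of the library, makes the mapping explicit:

#include <cstdint>
#include <vector>

// Unpack one row of a MatMulNativeConfigsMatrix into named fields.
struct MatMulConfigRow
{
    int32_t m, n, k, b;   // reference workload the row was tuned for
    int32_t m0, n0, k0;   // block sizes passed to the matmul kernel
    bool    img_rhs;      // whether to export the RHS tensor to a cl_image
};

MatMulConfigRow decode_row(const std::vector<int32_t> &row)
{
    return {row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7] != 0};
}

Read this way, {3136, 64, 64, 36, 4, 4, 16, 1} in configs_mnkb_best_nt_nt says: for workloads near M=3136, N=64, K=64, batch=36, use a 4x4 output block with K0=16 and export the RHS to a cl_image.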
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
index fe167d18dd..6b39db6a3f 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
@@ -44,10 +44,13 @@ public:
MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override;
private:
- MatMulKernelInfo configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
- MatMulKernelInfo configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
- MatMulKernelInfo configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G710_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G710_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G710_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
};
-} // namespace opencl
+} // namespace cl_matmul
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
index 1e06e84d4d..89cad30214 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
+
#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include <limits>
@@ -37,22 +38,32 @@ namespace cl_matmul
{
MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
const MatMulKernelInfo &info1,
- unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding)
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type,
+ bool rhs_lock_padding)
{
- ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, "The fallback MatMul configuration cannot have export_to_cl_image = true");
- ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, "The MatMul configurations must have the same adj_lhs value");
- ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, "The MatMul configurations must have the same adj_rhs value");
+ ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true,
+ "The fallback MatMul configuration cannot have export_to_cl_image = true");
+ ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+ "The MatMul configurations must have the same adj_lhs value");
+ ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+ "The MatMul configurations must have the same adj_rhs value");
const bool adj_lhs = info0.adj_lhs;
const bool adj_rhs = info0.adj_rhs;
- TensorInfo lhs_info = !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
- TensorInfo rhs_info = !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
+ TensorInfo lhs_info =
+ !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
+ TensorInfo rhs_info =
+ !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
TensorInfo dst_info;
- if(rhs_lock_padding == false)
+ if (rhs_lock_padding == false)
{
- if(bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+ if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
{
return info0;
}
@@ -67,7 +78,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
}
}
-MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+ bool adj_lhs,
+ bool adj_rhs,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b)
{
size_t min_acc = std::numeric_limits<size_t>::max();
size_t min_idx = 0;
@@ -76,12 +93,13 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh
const size_t num_rows = configs.size();
const size_t num_cols = configs[0].size();
- ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
+ ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U,
+ "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
ARM_COMPUTE_UNUSED(num_cols);
// Find nearest GeMM workload
// Note: the workload does not depend on the K dimension
- for(size_t y = 0; y < num_rows; ++y)
+ for (size_t y = 0; y < num_rows; ++y)
{
size_t mc0 = static_cast<size_t>(configs[y][0]);
size_t nc0 = static_cast<size_t>(configs[y][1]);
@@ -94,7 +112,7 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh
acc += (k - kc0) * (k - kc0);
acc += (b - bc0) * (b - bc0);
acc = std::sqrt(acc);
- if(acc < min_acc)
+ if (acc < min_acc)
{
min_acc = acc;
min_idx = y;
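
Together, find_info and select_info implement a two-stage lookup: nearest tuned row first (squared distance over the M, N, K, B columns), validated fallback second. A hedged sketch of the flow as the configure_G710_* functions use it (assumed to live inside namespace arm_compute::cl_matmul so both helpers are visible):

// desc0 is the nearest "best" configuration, desc1 the nearest fallback
// (which never exports the RHS to a cl_image). select_info returns desc0
// only if ClMatMulNativeKernel::validate accepts it and the RHS paddings
// are not locked; otherwise it returns desc1.
MatMulKernelInfo pick_config(const MatMulNativeConfigsMatrix &best,
                             const MatMulNativeConfigsMatrix &fallback,
                             bool adj_lhs, bool adj_rhs,
                             unsigned int m, unsigned int n, unsigned int k, unsigned int b,
                             DataType data_type, bool rhs_lock_padding)
{
    const MatMulKernelInfo desc0 = find_info(best, adj_lhs, adj_rhs, m, n, k, b);
    const MatMulKernelInfo desc1 = find_info(fallback, adj_lhs, adj_rhs, m, n, k, b);

    return select_info(desc0, desc1, m, n, k, b, data_type, rhs_lock_padding);
}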
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
index 3881617558..a114fffa68 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
@@ -52,7 +52,12 @@ using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>;
*/
MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
const MatMulKernelInfo &info1,
- unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding);
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type,
+ bool rhs_lock_padding);
/** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user
*
@@ -66,7 +71,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
*
* @return @ref MatMulKernelInfo
*/
-MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+ bool adj_lhs,
+ bool adj_rhs,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b);
} // namespace cl_matmul
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS */
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
index a2dbfc7dd5..b10018a6d2 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
@@ -45,7 +45,7 @@ public:
*/
static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
@@ -56,6 +56,6 @@ public:
}
}
};
-} // namespace opencl
+} // namespace cl_matmul
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
index 4f548bd01d..b9b091100c 100644
--- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -28,6 +28,7 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/MatMulInfo.h"
+
#include "src/core/common/Macros.h"
namespace arm_compute
@@ -53,8 +54,7 @@ public:
* @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
*
*/
- ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8)
- : _configs{ func_f32, func_f16, func_int8 }
+ ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
{
}
@@ -66,7 +66,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -93,8 +93,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClMatMulNativeKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig);