Diffstat (limited to 'src/gpu/cl')
206 files changed, 6860 insertions, 4357 deletions
diff --git a/src/gpu/cl/ClContext.cpp b/src/gpu/cl/ClContext.cpp
index d8ef18e62e..611c1cb501 100644
--- a/src/gpu/cl/ClContext.cpp
+++ b/src/gpu/cl/ClContext.cpp
@@ -23,11 +23,11 @@
  */
 #include "src/gpu/cl/ClContext.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "src/gpu/cl/ClQueue.h"
 #include "src/gpu/cl/ClTensor.h"
 
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-
 namespace arm_compute
 {
 namespace gpu
@@ -41,7 +41,7 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename)
     bool                 status = false;
     mlgo::MLGOHeuristics heuristics;
 
-    if(filename != nullptr)
+    if (filename != nullptr)
     {
         status = heuristics.reload_from_file(filename);
     }
@@ -50,12 +50,9 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename)
 } // namespace
 
 ClContext::ClContext(const AclContextOptions *options)
-    : IContext(Target::GpuOcl),
-      _mlgo_heuristics(),
-      _cl_ctx(),
-      _cl_dev()
+    : IContext(Target::GpuOcl), _mlgo_heuristics(), _cl_ctx(), _cl_dev()
 {
-    if(options != nullptr)
+    if (options != nullptr)
     {
         _mlgo_heuristics = populate_mlgo(options->kernel_config_file);
     }
@@ -80,7 +77,7 @@ const mlgo::MLGOHeuristics &ClContext::mlgo() const
 
 bool ClContext::set_cl_ctx(::cl::Context ctx)
 {
-    if(this->refcount() == 0)
+    if (this->refcount() == 0)
     {
         _cl_ctx = ctx;
         CLScheduler::get().set_context(ctx);
@@ -92,7 +89,7 @@ bool ClContext::set_cl_ctx(::cl::Context ctx)
 ITensorV2 *ClContext::create_tensor(const AclTensorDescriptor &desc, bool allocate)
 {
     ClTensor *tensor = new ClTensor(this, desc);
-    if(tensor != nullptr && allocate)
+    if (tensor != nullptr && allocate)
     {
         tensor->allocate();
     }
diff --git a/src/gpu/cl/ClContext.h b/src/gpu/cl/ClContext.h
index a50b03124b..2c67ccf4d2 100644
--- a/src/gpu/cl/ClContext.h
+++ b/src/gpu/cl/ClContext.h
@@ -24,11 +24,11 @@
 #ifndef SRC_GPU_CLCONTEXT_H
 #define SRC_GPU_CLCONTEXT_H
 
+#include "arm_compute/core/CL/OpenCL.h"
+
 #include "src/common/IContext.h"
 #include "src/runtime/CL/mlgo/MLGOHeuristics.h"
 
-#include "arm_compute/core/CL/OpenCL.h"
-
 namespace arm_compute
 {
 namespace gpu
@@ -74,9 +74,9 @@ public:
     bool set_cl_ctx(::cl::Context ctx);
 
     // Inherrited methods overridden
-    ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
-    IQueue    *create_queue(const AclQueueOptions *options) override;
-    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+    ITensorV2                          *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
+    IQueue                             *create_queue(const AclQueueOptions *options) override;
+    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
                                                           const AclTensorDescriptor &dst,
                                                           const AclActivationDescriptor &act,
                                                           bool is_validate) override;
@@ -90,4 +90,4 @@ private:
 } // namespace gpu
 } // namespace arm_compute
 
-#endif /* SRC_GPU_CLCONTEXT_H */
\ No newline at end of file
+#endif /* SRC_GPU_CLCONTEXT_H */
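The two guards reformatted above follow the same defensive pattern: populate_mlgo() treats a null config path as "use defaults", and set_cl_ctx() only swaps the backing OpenCL context while nothing references it. A minimal standalone sketch of that pattern; all names here are illustrative stand-ins, not the Compute Library API:

    #include <iostream>

    struct Heuristics
    {
        bool loaded{false};
    };

    // Like populate_mlgo(): a null config path is valid and yields defaults.
    Heuristics populate(const char *filename)
    {
        Heuristics h;
        if (filename != nullptr)
        {
            h.loaded = true; // stands in for heuristics.reload_from_file(filename)
        }
        return h;
    }

    class Context
    {
    public:
        // Like set_cl_ctx(): replacing the backend handle is refused once any
        // queue or tensor holds a reference to this context.
        bool set_backend_context(int handle)
        {
            if (refcount == 0)
            {
                backend = handle;
                return true;
            }
            return false;
        }
        int refcount{0};
        int backend{-1};
    };

    int main()
    {
        Context ctx;
        std::cout << populate(nullptr).loaded << ' ' << ctx.set_backend_context(42) << '\n'; // prints "0 1"
    }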
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 73bb96298e..bcade94522 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -37,24 +37,16 @@ namespace
 {
 /* Decoding table */
-constexpr std::array<uint8_t, 256> b64_invtab =
-{
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
-    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
-    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
-    0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
-    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+constexpr std::array<uint8_t, 256> b64_invtab = {
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  62, 0,  0,  0,  63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+    0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+    22, 23, 24, 25, 0,  0,  0,  0,  0,  0,  26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+    45, 46, 47, 48, 49, 50, 51, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 };
 
 /** Decode a base64 encoded string
@@ -68,13 +60,13 @@ std::string decode_base64(const std::string &str)
     constexpr const char pad_char = '=';
 
     // Handle empty string
-    if(str.empty())
+    if (str.empty())
     {
         return {};
     }
 
     // Base64 encoded string has size multiple of 4
-    if(str.length() % 4)
+    if (str.length() % 4)
     {
         return {};
     }
@@ -92,7 +84,7 @@ std::string decode_base64(const std::string &str)
     // Block decoding function (exclude padding)
     int       c   = 0;
     const int end = str_len - 4 - padding;
-    for(; c <= end; c += 4)
+    for (; c <= end; c += 4)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
@@ -105,7 +97,7 @@ std::string decode_base64(const std::string &str)
     }
 
     // Last step that might contain padding symbols
-    if(padding == 1)
+    if (padding == 1)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
@@ -114,7 +106,7 @@ std::string decode_base64(const std::string &str)
         dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
         dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
     }
-    else if(padding == 2)
+    else if (padding == 2)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
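The block-decoding loop above packs four 6-bit base64 symbols into three output bytes with shifts and ORs. A standalone illustration of that arithmetic; building the inverse table from the alphabet at runtime is an assumption for brevity (the library embeds it as the constexpr b64_invtab above):

    #include <array>
    #include <cstdint>
    #include <iostream>
    #include <string>

    int main()
    {
        const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        std::array<uint8_t, 256> invtab{};
        for (size_t i = 0; i < alphabet.size(); ++i)
        {
            invtab[static_cast<uint8_t>(alphabet[i])] = static_cast<uint8_t>(i);
        }

        const std::string group = "TWFu"; // base64 for "Man"
        const int b0 = invtab[static_cast<uint8_t>(group[0])];
        const int b1 = invtab[static_cast<uint8_t>(group[1])];
        const int b2 = invtab[static_cast<uint8_t>(group[2])];
        const int b3 = invtab[static_cast<uint8_t>(group[3])];

        std::string out;
        out.push_back(static_cast<char>(((b0 << 2) | (b1 >> 4)) & 0xFF)); // same shifts as the loop body above
        out.push_back(static_cast<char>(((b1 << 4) | (b2 >> 2)) & 0xFF));
        out.push_back(static_cast<char>(((b2 << 6) | b3) & 0xFF));
        std::cout << out << '\n'; // prints "Man"
    }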
@@ -135,7 +127,7 @@ std::string decompress_zlib(const std::string &str)
 {
     // Create and initialize decompression stream
     z_stream ds{};
-    if(inflateInit(&ds) != Z_OK)
+    if (inflateInit(&ds) != Z_OK)
     {
         return std::string();
     }
@@ -152,16 +144,15 @@ decompress_zlib(const std::string &str)
         ds.next_out  = reinterpret_cast<Bytef *>(roll_buff);
         status       = inflate(&ds, 0);
 
-        if(inflated_str.size() < ds.total_out)
+        if (inflated_str.size() < ds.total_out)
         {
             inflated_str.append(roll_buff, ds.total_out - inflated_str.size());
         }
-    }
-    while(status == Z_OK);
+    } while (status == Z_OK);
 
     // Finalize decompression stream
     inflateEnd(&ds);
-    if(status != Z_STREAM_END)
+    if (status != Z_STREAM_END)
     {
         return std::string();
     }
@@ -175,323 +166,321 @@ namespace arm_compute
 {
 namespace opencl
 {
-const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
-{
+const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = {
     // Common Kernels
-    { "activation_layer", "common/activation_layer.cl" },
-    { "activation_layer_quant", "common/activation_layer_quant.cl" },
-    { "activation_layer_quant_f32", "common/activation_layer_quant.cl" },
-    { "arg_min_max_x", "common/arg_min_max.cl" },
-    { "arg_min_max_y", "common/arg_min_max.cl" },
-    { "arg_min_max_z", "common/arg_min_max.cl" },
-    { "arg_min_max_w", "common/arg_min_max.cl" },
-    { "bitwise_or", "common/bitwise_op.cl" },
-    { "bitwise_and", "common/bitwise_op.cl" },
-    { "bitwise_xor", "common/bitwise_op.cl" },
-    { "bitwise_not", "common/bitwise_op.cl" },
-    { "bounding_box_transform", "common/bounding_box_transform.cl" },
-    { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" },
-    { "compare_equal", "common/comparisons.cl" },
-    { "compare_equal_quantized", "common/comparisons.cl" },
-    { "compare_notequal", "common/comparisons.cl" },
-    { "compare_notequal_quantized", "common/comparisons.cl" },
-    { "compare_greater", "common/comparisons.cl" },
-    { "compare_greater_quantized", "common/comparisons.cl" },
-    { "compare_greaterequal", "common/comparisons.cl" },
-    { "compare_greaterequal_quantized", "common/comparisons.cl" },
-    { "compare_less", "common/comparisons.cl" },
-    { "compare_less_quantized", "common/comparisons.cl" },
-    { "compare_lessequal", "common/comparisons.cl" },
-    { "compare_lessequal_quantized", "common/comparisons.cl" },
-    { "concatenate", "common/concatenate.cl" },
-    { "concatenate_width", "common/concatenate.cl" },
-    { "concatenate_height", "common/concatenate.cl" },
-    { "concatenate_width_x2", "common/concatenate.cl" },
-    { "concatenate_width_x4", "common/concatenate.cl" },
-    { "col2im", "common/col2im.cl" },
-    { "cast_down", "common/cast.cl" },
-    { "cast_up", "common/cast.cl" },
-    { "convert_fc_weights", "common/convert_fc_weights.cl" },
-    { "copy_tensor", "common/copy_tensor.cl" },
-    { "crop_tensor", "common/crop_tensor.cl" },
-    { "deconvolution_reshape", "common/deconvolution_layer.cl" },
-    { "deconvolution_upsample", "common/deconvolution_layer.cl" },
-    { "dequantization_layer", "common/dequantization_layer.cl" },
-    { "elementwise_operation_ADD", "common/elementwise_operation.cl" },
-    { "elementwise_operation_SUB", "common/elementwise_operation.cl" },
-    { "elementwise_operation_MAX", "common/elementwise_operation.cl" },
-    { "elementwise_operation_MIN", "common/elementwise_operation.cl" },
-    { "elementwise_operation_DIV", "common/elementwise_operation.cl" },
-    { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" },
-    { "elementwise_operation_POWER", "common/elementwise_operation.cl" },
-    { "elementwise_operation_PRELU", "common/elementwise_operation.cl" },
-    { "elementwise_operation_AND", "common/elementwise_operation.cl" },
-    { "elementwise_operation_OR", "common/elementwise_operation.cl" },
-    { "elementwise_operation_ADD_quantized",
"common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_unary", "common/elementwise_unary.cl" }, - { "elementwise_unary_quantized", "common/elementwise_unary_quantized.cl" }, - { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" }, - { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" }, - { "fft_radix_2_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_2_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_2_axis_0", "common/fft.cl" }, - { "fft_radix_2_axis_1", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_3_axis_0", "common/fft.cl" }, - { "fft_radix_3_axis_1", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_4_axis_0", "common/fft.cl" }, - { "fft_radix_4_axis_1", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_5_axis_0", "common/fft.cl" }, - { "fft_radix_5_axis_1", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_7_axis_0", "common/fft.cl" }, - { "fft_radix_7_axis_1", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_8_axis_0", "common/fft.cl" }, - { "fft_radix_8_axis_1", "common/fft.cl" }, - { "fft_scale_conj", "common/fft_scale.cl" }, - { "fill_image_borders_constant", "common/fill_border.cl" }, - { "fill_image_borders_replicate", "common/fill_border.cl" }, - { "floor_layer", "common/floor.cl" }, - { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" }, - { "gather", "common/gather.cl" }, - { "gemm_ma_f16", "common/gemm.cl" }, - { "gemm_ma_f32", "common/gemm.cl" }, - { "gemm_mv", "common/gemv.cl" }, - { "gemm_mv_quantized", "common/gemv.cl" }, - { "gemm_mm_native", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, - { "gemm_lc_vm_f32", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" }, - { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" }, - { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" }, - { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" }, - { 
"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_mm_native", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl" }, - { "gemmlowp_offset_contribution", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" }, - { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" }, - { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" }, - { "instance_normalization", "common/instance_normalization.cl" }, - { "compute_mean_var", "common/instance_normalization.cl" }, - { "l2_normalize_x", "common/l2_normalize.cl" }, - { "l2_normalize_y", "common/l2_normalize.cl" }, - { "l2_normalize_z", "common/l2_normalize.cl" }, - { "mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_nt_nt", "common/mat_mul.cl" }, - { "mat_mul_native_nt_t", "common/mat_mul.cl" }, - { "mat_mul_native_t_nt", "common/mat_mul.cl" }, - { "mat_mul_native_t_t", "common/mat_mul.cl" }, - { "mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl" }, - { "max_unpooling_layer_2", "common/unpooling_layer.cl" }, - { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" }, - { "memset", "common/memset.cl" }, - { "minmax_layer", "common/minmax_layer.cl" }, - { "non_max_suppression", "common/nonmax.cl" }, - { "pad_layer_constant", "common/pad_layer.cl" }, - { "pad_layer_symmetric_reflect", "common/pad_layer.cl" }, - { "permute", "common/permute.cl" }, - { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" }, - { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" }, - { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" }, - { "quantization_layer", "common/quantization_layer.cl" }, - { "range", "common/range.cl" }, - { "range_quantized", "common/range.cl" }, - { "reduction_operation_x", "common/reduction_operation.cl" }, - { "reduction_operation_non_parallel_x", 
"common/reduction_operation.cl" }, - { "reduction_operation_y", "common/reduction_operation.cl" }, - { "reduction_operation_z", "common/reduction_operation.cl" }, - { "reduction_operation_w", "common/reduction_operation.cl" }, - { "reshape_layer", "common/reshape_layer.cl" }, - { "reshape_to_columns", "common/convolution_layer.cl" }, - { "reverse", "common/reverse.cl" }, - { "roi_align_layer", "common/roi_align_layer.cl" }, - { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" }, - { "roi_pooling_layer", "common/roi_pooling_layer.cl" }, - { "select_same_rank", "common/select.cl" }, - { "select_different_rank_2", "common/select.cl" }, - { "select_different_rank_n", "common/select.cl" }, - { "softmax_layer_norm", "common/softmax_layer.cl" }, - { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" }, - { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" }, - { "stack_layer", "common/stack_layer.cl" }, - { "strided_slice", "common/slice_ops.cl" }, - { "tile", "common/tile.cl" }, - { "transpose", "common/transpose.cl" }, + {"activation_layer", "common/activation_layer.cl"}, + {"activation_layer_quant", "common/activation_layer_quant.cl"}, + {"activation_layer_quant_f32", "common/activation_layer_quant.cl"}, + {"arg_min_max_x", "common/arg_min_max.cl"}, + {"arg_min_max_y", "common/arg_min_max.cl"}, + {"arg_min_max_z", "common/arg_min_max.cl"}, + {"arg_min_max_w", "common/arg_min_max.cl"}, + {"bitwise_or", "common/bitwise_op.cl"}, + {"bitwise_and", "common/bitwise_op.cl"}, + {"bitwise_xor", "common/bitwise_op.cl"}, + {"bitwise_not", "common/bitwise_op.cl"}, + {"bounding_box_transform", "common/bounding_box_transform.cl"}, + {"bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl"}, + {"compare_equal", "common/comparisons.cl"}, + {"compare_equal_quantized", "common/comparisons.cl"}, + {"compare_notequal", "common/comparisons.cl"}, + {"compare_notequal_quantized", "common/comparisons.cl"}, + {"compare_greater", "common/comparisons.cl"}, + {"compare_greater_quantized", "common/comparisons.cl"}, + {"compare_greaterequal", "common/comparisons.cl"}, + {"compare_greaterequal_quantized", "common/comparisons.cl"}, + {"compare_less", "common/comparisons.cl"}, + {"compare_less_quantized", "common/comparisons.cl"}, + {"compare_lessequal", "common/comparisons.cl"}, + {"compare_lessequal_quantized", "common/comparisons.cl"}, + {"concatenate", "common/concatenate.cl"}, + {"concatenate_width", "common/concatenate.cl"}, + {"concatenate_height", "common/concatenate.cl"}, + {"concatenate_width_x2", "common/concatenate.cl"}, + {"concatenate_width_x4", "common/concatenate.cl"}, + {"col2im", "common/col2im.cl"}, + {"cast_down", "common/cast.cl"}, + {"cast_up", "common/cast.cl"}, + {"convert_fc_weights", "common/convert_fc_weights.cl"}, + {"copy_tensor", "common/copy_tensor.cl"}, + {"crop_tensor", "common/crop_tensor.cl"}, + {"deconvolution_reshape", "common/deconvolution_layer.cl"}, + {"deconvolution_upsample", "common/deconvolution_layer.cl"}, + {"dequantization_layer", "common/dequantization_layer.cl"}, + {"elementwise_operation_ADD", "common/elementwise_operation.cl"}, + {"elementwise_operation_SUB", "common/elementwise_operation.cl"}, + {"elementwise_operation_MAX", 
"common/elementwise_operation.cl"}, + {"elementwise_operation_MIN", "common/elementwise_operation.cl"}, + {"elementwise_operation_DIV", "common/elementwise_operation.cl"}, + {"elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl"}, + {"elementwise_operation_POWER", "common/elementwise_operation.cl"}, + {"elementwise_operation_PRELU", "common/elementwise_operation.cl"}, + {"elementwise_operation_AND", "common/elementwise_operation.cl"}, + {"elementwise_operation_OR", "common/elementwise_operation.cl"}, + {"elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_unary", "common/elementwise_unary.cl"}, + {"elementwise_unary_quantized", "common/elementwise_unary_quantized.cl"}, + {"fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl"}, + {"fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl"}, + {"fft_radix_2_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_2_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_2_axis_0", "common/fft.cl"}, + {"fft_radix_2_axis_1", "common/fft.cl"}, + {"fft_radix_3_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_3_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_3_axis_0", "common/fft.cl"}, + {"fft_radix_3_axis_1", "common/fft.cl"}, + {"fft_radix_4_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_4_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_4_axis_0", "common/fft.cl"}, + {"fft_radix_4_axis_1", "common/fft.cl"}, + {"fft_radix_5_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_5_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_5_axis_0", "common/fft.cl"}, + {"fft_radix_5_axis_1", "common/fft.cl"}, + {"fft_radix_7_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_7_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_7_axis_0", "common/fft.cl"}, + {"fft_radix_7_axis_1", "common/fft.cl"}, + {"fft_radix_8_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_8_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_8_axis_0", "common/fft.cl"}, + {"fft_radix_8_axis_1", "common/fft.cl"}, + {"fft_scale_conj", "common/fft_scale.cl"}, + {"fill_image_borders_constant", "common/fill_border.cl"}, + {"fill_image_borders_replicate", "common/fill_border.cl"}, + {"floor_layer", "common/floor.cl"}, + {"fuse_batchnormalization_layer", "common/batchnormalization_layer.cl"}, + {"gather", "common/gather.cl"}, + {"gemm_ma_f16", "common/gemm.cl"}, + {"gemm_ma_f32", "common/gemm.cl"}, + {"gemm_mv", "common/gemv.cl"}, + {"gemm_mv_quantized", "common/gemv.cl"}, + {"gemm_mm_native", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl"}, + {"gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl"}, + 
{"gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_t", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl"}, + {"gemm_lc_vm_f32", "common/gemm.cl"}, + {"gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl"}, + {"gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl"}, + {"gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl"}, + {"gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl"}, + {"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl"}, + {"gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl"}, + {"gemmlowp_matrix_b_reduction", "common/gemmlowp.cl"}, + {"gemmlowp_mm_native", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl"}, + {"gemmlowp_offset_contribution", "common/gemmlowp.cl"}, + {"gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl"}, + {"gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl"}, + {"generate_proposals_compute_all_anchors", "common/generate_proposals.cl"}, + {"generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl"}, + {"instance_normalization", "common/instance_normalization.cl"}, + {"compute_mean_var", "common/instance_normalization.cl"}, + {"l2_normalize_x", "common/l2_normalize.cl"}, + {"l2_normalize_y", "common/l2_normalize.cl"}, + {"l2_normalize_z", "common/l2_normalize.cl"}, + {"mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_nt_nt", "common/mat_mul.cl"}, + {"mat_mul_native_nt_t", "common/mat_mul.cl"}, + {"mat_mul_native_t_nt", "common/mat_mul.cl"}, + {"mat_mul_native_t_t", "common/mat_mul.cl"}, + {"mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl"}, + {"max_unpooling_layer_2", "common/unpooling_layer.cl"}, + {"mean_stddev_normalization", "common/mean_stddev_normalization.cl"}, + {"memset", "common/memset.cl"}, + {"minmax_layer", "common/minmax_layer.cl"}, + {"non_max_suppression", "common/nonmax.cl"}, + {"pad_layer_constant", "common/pad_layer.cl"}, + {"pad_layer_symmetric_reflect", "common/pad_layer.cl"}, + {"permute", "common/permute.cl"}, + {"pixelwise_mul_complex", "common/pixelwise_mul_float.cl"}, + {"pixelwise_mul_float", "common/pixelwise_mul_float.cl"}, + {"pixelwise_mul_int", "common/pixelwise_mul_int.cl"}, + {"pixelwise_mul_quantized", 
"common/pixelwise_mul_int.cl"}, + {"qlstm_layer_normalization", "common/qlstm_layer_normalization.cl"}, + {"quantization_layer", "common/quantization_layer.cl"}, + {"range", "common/range.cl"}, + {"range_quantized", "common/range.cl"}, + {"reduction_operation_x", "common/reduction_operation.cl"}, + {"reduction_operation_non_parallel_x", "common/reduction_operation.cl"}, + {"reduction_operation_y", "common/reduction_operation.cl"}, + {"reduction_operation_z", "common/reduction_operation.cl"}, + {"reduction_operation_w", "common/reduction_operation.cl"}, + {"reshape_layer", "common/reshape_layer.cl"}, + {"reshape_to_columns", "common/convolution_layer.cl"}, + {"reverse", "common/reverse.cl"}, + {"roi_align_layer", "common/roi_align_layer.cl"}, + {"roi_align_layer_quantized", "common/roi_align_layer_quantized.cl"}, + {"roi_pooling_layer", "common/roi_pooling_layer.cl"}, + {"select_same_rank", "common/select.cl"}, + {"select_different_rank_2", "common/select.cl"}, + {"select_different_rank_n", "common/select.cl"}, + {"softmax_layer_norm", "common/softmax_layer.cl"}, + {"softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl"}, + {"softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl"}, + {"stack_layer", "common/stack_layer.cl"}, + {"strided_slice", "common/slice_ops.cl"}, + {"tile", "common/tile.cl"}, + {"transpose", "common/transpose.cl"}, #ifdef ENABLE_NCHW_KERNELS - { "batch_to_space_nchw", "nchw/batch_to_space.cl" }, - { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" }, - { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" }, - { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" }, - { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, - { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution_nchw", "nchw/direct_convolution.cl" }, + {"batch_to_space_nchw", "nchw/batch_to_space.cl"}, + {"batch_to_space_static_nchw", "nchw/batch_to_space.cl"}, + {"batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl"}, + {"channel_shuffle_nchw", "nchw/channel_shuffle.cl"}, + {"depth_to_space_nchw", "nchw/depth_to_space.cl"}, + {"dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl"}, + {"direct_convolution1x1", "nchw/direct_convolution1x1.cl"}, + {"direct_convolution_nchw", "nchw/direct_convolution.cl"}, - { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, - { "im2col3x3_nchw", "nchw/im2col.cl" }, - { "im2col5x5_nchw", "nchw/im2col.cl" }, - { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "im2col_generic_nchw", "nchw/im2col.cl" }, - { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" }, - { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" }, - { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" }, - { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, - { "scale_nearest_neighbour_nchw", 
"nchw/scale.cl" }, - { "scale_bilinear_nchw", "nchw/scale.cl" }, - { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, - { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, - { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, - { "upsample_layer_nchw", "nchw/upsample_layer.cl" }, - { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" }, + {"im2col1x1_stridex1_nchw", "nchw/im2col.cl"}, + {"im2col3x3_nchw", "nchw/im2col.cl"}, + {"im2col5x5_nchw", "nchw/im2col.cl"}, + {"im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl"}, + {"im2col_generic_nchw", "nchw/im2col.cl"}, + {"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl"}, + {"normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl"}, + {"normalization_layer_in_map_nchw", "nchw/normalization_layer.cl"}, + {"normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl"}, + {"normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl"}, + {"pooling_layer_MxN_nchw", "nchw/pooling_layer.cl"}, + 
{"pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl"}, + {"prior_box_layer_nchw", "nchw/prior_box_layer.cl"}, + {"reorg_layer_nchw", "nchw/reorg_layer.cl"}, + {"scale_nearest_neighbour_nchw", "nchw/scale.cl"}, + {"scale_bilinear_nchw", "nchw/scale.cl"}, + {"space_to_batch_nchw", "nchw/space_to_batch.cl"}, + {"space_to_batch_static_nchw", "nchw/space_to_batch.cl"}, + {"space_to_depth_nchw", "nchw/space_to_depth.cl"}, + {"upsample_layer_nchw", "nchw/upsample_layer.cl"}, + {"winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl"}, #endif /* ENABLE_NCHW_KERNELS */ #ifdef ENABLE_NHWC_KERNELS - { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" }, - { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" }, - { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" }, - { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" }, - { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" }, - { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" }, - { 
"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" }, - { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" }, - { "direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl" }, - { "im2col3x3_nhwc", "nhwc/im2col.cl" }, - { "im2col9x9_nhwc", "nhwc/im2col.cl" }, - { "im2col_generic_nhwc", "nhwc/im2col.cl" }, - { "indirect_convolution_nhwc", "nhwc/indirect_convolution.cl" }, - { "indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl" }, - { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" }, - { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" }, - { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" }, - { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, - { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, - { "scale_bilinear_nhwc", "nhwc/scale.cl" }, - { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, - { "transposed_convolution_nhwc", "nhwc/transposed_convolution.cl" }, - { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" }, - { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nhwc", 
"nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" }, + {"batch_to_space_nhwc", "nhwc/batch_to_space.cl"}, + {"batch_to_space_static_nhwc", "nhwc/batch_to_space.cl"}, + {"batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl"}, + {"channel_shuffle_nhwc", "nhwc/channel_shuffle.cl"}, + {"depth_to_space_nhwc", "nhwc/depth_to_space.cl"}, + {"dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl"}, + {"dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl"}, + {"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl"}, + {"direct_convolution_nhwc", "nhwc/direct_convolution.cl"}, + {"direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl"}, + {"im2col3x3_nhwc", "nhwc/im2col.cl"}, + {"im2col9x9_nhwc", "nhwc/im2col.cl"}, + {"im2col_generic_nhwc", "nhwc/im2col.cl"}, + {"indirect_convolution_nhwc", "nhwc/indirect_convolution.cl"}, + {"indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl"}, + {"normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl"}, + {"normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl"}, + {"normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl"}, + {"normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl"}, + {"pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl"}, + {"pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl"}, + {"pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl"}, + {"pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl"}, + {"pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl"}, + {"reorg_layer_nhwc", "nhwc/reorg_layer.cl"}, + {"scale_nearest_neighbour_nhwc", "nhwc/scale.cl"}, + {"scale_bilinear_nhwc", "nhwc/scale.cl"}, + {"space_to_batch_nhwc", "nhwc/space_to_batch.cl"}, + {"space_to_batch_static_nhwc", "nhwc/space_to_batch.cl"}, + {"space_to_depth_nhwc", "nhwc/space_to_depth.cl"}, + {"transposed_convolution_nhwc", "nhwc/transposed_convolution.cl"}, + {"upsample_layer_nhwc", "nhwc/upsample_layer.cl"}, + {"winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + 
{"winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl"}, #endif /* ENABLE_NHWC_KERNELS */ }; -const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = -{ +const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = { #ifdef EMBEDDED_KERNELS { "activation_float_helpers.h", @@ -996,7 +985,7 @@ std::string ClKernelLibrary::program_name(const std::string &kernel_name) const // Find which program contains the kernel auto kernel_program_it = _kernel_program_map.find(kernel_name); - if(_kernel_program_map.end() == kernel_program_it) + if (_kernel_program_map.end() == kernel_program_it) { ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); } @@ -1022,14 +1011,14 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr #ifdef EMBEDDED_KERNELS #ifdef ARM_COMPUTE_COMPRESSED_KERNELS const auto inflatted_program_source_it = _decompressed_source_map.find(program_name); - if(inflatted_program_source_it != _decompressed_source_map.end()) + if (inflatted_program_source_it != _decompressed_source_map.end()) { - return ClProgramInfo{ inflatted_program_source_it->second, false }; + return ClProgramInfo{inflatted_program_source_it->second, false}; } #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ const auto program_source_it = _program_source_map.find(program_name); - if(program_source_it == _program_source_map.end()) + if (program_source_it == _program_source_map.end()) { ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); } @@ -1042,7 +1031,7 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr program_source = std::move(decompressed_program_source); #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - return ClProgramInfo{ program_source, false }; + return ClProgramInfo{program_source, false}; #else /* EMBEDDED_KERNELS */ // Check for binary std::string source_name = _kernel_path + program_name; @@ -1050,12 +1039,12 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr std::string program_source{}; bool is_binary = false; - if(std::ifstream(binary_name).is_open()) + if (std::ifstream(binary_name).is_open()) { program_source = read_file(binary_name, true); is_binary = true; } - else if(std::ifstream(source_name).is_open()) + else if (std::ifstream(source_name).is_open()) { program_source = read_file(source_name, false); } @@ -1064,7 +1053,7 @@ ClKernelLibrary::ClProgramInfo 
ClKernelLibrary::program(const std::string &progr
         ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
     }
 
-    return ClProgramInfo{ program_source, is_binary };
+    return ClProgramInfo{program_source, is_binary};
 #endif /* EMBEDDED_KERNELS */
 }
 } // namespace opencl
diff --git a/src/gpu/cl/ClKernelLibrary.h b/src/gpu/cl/ClKernelLibrary.h
index 42bec95032..cd1d689199 100644
--- a/src/gpu/cl/ClKernelLibrary.h
+++ b/src/gpu/cl/ClKernelLibrary.h
@@ -52,8 +52,8 @@ public:
     /** Structure to encapsulte program related information */
     struct ClProgramInfo
     {
-        std::string program{};          /**< Program raw string */
-        bool        is_binary{ false }; /**< Flag that indicates if is in binary format */
+        std::string program{};        /**< Program raw string */
+        bool        is_binary{false}; /**< Flag that indicates if is in binary format */
     };
 
 public:
@@ -84,10 +84,12 @@ public:
     std::string program_name(const std::string &kernel_name) const;
 
 private:
-    std::string                                     _kernel_path{};             /**< Path to the kernels folder. */
-    mutable std::map<std::string, std::string>      _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
-    static const std::map<std::string, std::string> _kernel_program_map;        /**< Map that associates kernel names with programs. */
-    static const std::map<std::string, std::string> _program_source_map;        /**< Contains sources for all programs.
+    std::string _kernel_path{}; /**< Path to the kernels folder. */
+    mutable std::map<std::string, std::string>
+        _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
+    static const std::map<std::string, std::string>
+        _kernel_program_map; /**< Map that associates kernel names with programs. */
+    static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
                                                                                Used for compile-time kernel inclusion.
                                                                                >*/
 };
 } // namespace opencl
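When ARM_COMPUTE_COMPRESSED_KERNELS is enabled, program() above inflates a base64-decoded, zlib-compressed source and caches it in _decompressed_source_map. A standalone round trip using the same do/while inflate() loop shape as decompress_zlib(); buffer sizes and names here are illustrative:

    #include <zlib.h>
    #include <iostream>
    #include <string>

    int main()
    {
        const std::string src = "embedded OpenCL kernel source, embedded OpenCL kernel source";

        // Compress into a worst-case sized buffer.
        uLongf comp_len = compressBound(src.size());
        std::string comp(comp_len, '\0');
        compress(reinterpret_cast<Bytef *>(&comp[0]), &comp_len,
                 reinterpret_cast<const Bytef *>(src.data()), src.size());

        // Decompress through a small rolling buffer, appending whatever each
        // inflate() call produced, until Z_STREAM_END.
        z_stream ds{};
        inflateInit(&ds);
        ds.avail_in = static_cast<uInt>(comp_len);
        ds.next_in  = reinterpret_cast<Bytef *>(&comp[0]);

        char        roll_buff[16];
        std::string out;
        int         status = Z_OK;
        do
        {
            ds.avail_out = sizeof(roll_buff);
            ds.next_out  = reinterpret_cast<Bytef *>(roll_buff);
            status       = inflate(&ds, 0);
            if (out.size() < ds.total_out)
            {
                out.append(roll_buff, ds.total_out - out.size());
            }
        } while (status == Z_OK);
        inflateEnd(&ds);

        std::cout << (out == src ? "round trip ok" : "mismatch") << '\n';
    }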
diff --git a/src/gpu/cl/ClQueue.cpp b/src/gpu/cl/ClQueue.cpp
index 2123adcf39..0cb7af5b61 100644
--- a/src/gpu/cl/ClQueue.cpp
+++ b/src/gpu/cl/ClQueue.cpp
@@ -36,7 +36,7 @@ namespace
 {
 CLTunerMode map_tuner_mode(AclTuningMode mode)
 {
-    switch(mode)
+    switch (mode)
     {
         case AclRapid:
             return CLTunerMode::RAPID;
@@ -55,7 +55,7 @@ CLTunerMode map_tuner_mode(AclTuningMode mode)
 
 std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
 {
-    if(options == nullptr || options->mode == AclTuningModeNone)
+    if (options == nullptr || options->mode == AclTuningModeNone)
     {
         return nullptr;
     }
@@ -68,8 +68,7 @@ std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
 }
 } // namespace
 
-ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options)
-    : IQueue(ctx), _tuner(nullptr)
+ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx), _tuner(nullptr)
 {
     _tuner = populate_tuner(options);
 }
diff --git a/src/gpu/cl/ClQueue.h b/src/gpu/cl/ClQueue.h
index b16a0f4e83..09ffb06cf3 100644
--- a/src/gpu/cl/ClQueue.h
+++ b/src/gpu/cl/ClQueue.h
@@ -24,10 +24,10 @@
 #ifndef SRC_GPU_CLQUEUE_H
 #define SRC_GPU_CLQUEUE_H
 
-#include "src/common/IQueue.h"
-
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/IQueue.h"
+
 #include <memory>
 
 namespace arm_compute
diff --git a/src/gpu/cl/ClTensor.cpp b/src/gpu/cl/ClTensor.cpp
index 0df07813e3..27422a4130 100644
--- a/src/gpu/cl/ClTensor.cpp
+++ b/src/gpu/cl/ClTensor.cpp
@@ -31,8 +31,7 @@ namespace gpu
 {
 namespace opencl
 {
-ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc)
-    : ITensorV2(ctx), _legacy_tensor()
+ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor()
 {
     ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::GpuOcl));
     _legacy_tensor = std::make_unique<CLTensor>();
@@ -43,7 +42,7 @@ void *ClTensor::map()
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:map]: Backing tensor does not exist!");
         return nullptr;
@@ -57,7 +56,7 @@ StatusCode ClTensor::unmap()
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:unmap]: Backing tensor does not exist!");
         return StatusCode::RuntimeError;
diff --git a/src/gpu/cl/ClTensor.h b/src/gpu/cl/ClTensor.h
index 99d228c0b8..70184cd4bd 100644
--- a/src/gpu/cl/ClTensor.h
+++ b/src/gpu/cl/ClTensor.h
@@ -24,10 +24,10 @@
 #ifndef SRC_GPU_CLTENSOR_H
 #define SRC_GPU_CLTENSOR_H
 
-#include "src/common/ITensorV2.h"
-
 #include "arm_compute/runtime/CL/CLTensor.h"
 
+#include "src/common/ITensorV2.h"
+
 namespace arm_compute
 {
 namespace gpu
@@ -54,7 +54,7 @@ public:
     void                  *map() override;
     StatusCode             unmap() override;
     arm_compute::ITensor  *tensor() const override;
-    StatusCode import(void *handle, ImportMemoryType type) override;
+    StatusCode             import(void *handle, ImportMemoryType type) override;
 
 private:
     std::unique_ptr<CLTensor> _legacy_tensor;
@@ -63,4 +63,4 @@ private:
 } // namespace gpu
 } // namespace arm_compute
 
-#endif /* SRC_GPU_CLTENSOR_H */
\ No newline at end of file
+#endif /* SRC_GPU_CLTENSOR_H */
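ClTensor::map()/unmap() above bracket every host access to the backing CLTensor. A standalone sketch of that map/write/unmap discipline; MappedBuffer is an illustrative stand-in, not the library's type:

    #include <cstring>
    #include <iostream>
    #include <vector>

    class MappedBuffer
    {
    public:
        explicit MappedBuffer(size_t bytes) : _storage(bytes) {}
        void *map()
        {
            _mapped = true;
            return _storage.data();
        }
        bool unmap()
        {
            const bool ok = _mapped; // mirrors reporting an error when no mapping exists
            _mapped       = false;
            return ok;
        }

    private:
        std::vector<unsigned char> _storage;
        bool                       _mapped{false};
    };

    int main()
    {
        MappedBuffer buf(4);
        const unsigned char data[4] = {1, 2, 3, 4};
        void *ptr = buf.map(); // host access is only valid between map() and unmap()
        std::memcpy(ptr, data, sizeof(data));
        std::cout << (buf.unmap() ? "unmapped" : "error") << '\n';
    }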
diff --git a/src/gpu/cl/IClKernel.h b/src/gpu/cl/IClKernel.h
index 52ea3c9183..4f07e9ad68 100644
--- a/src/gpu/cl/IClKernel.h
+++ b/src/gpu/cl/IClKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_ICL_KERNEL_H
 
 #include "arm_compute/core/ITensorInfo.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
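The activation-kernel validator below pins the output quantization for fused activations, e.g. QuantizationInfo(1.f / 128.f, 128) for TANH on QASYMM8. The reason is arithmetic: with scale 1/128 and offset 128, the uint8 range [0, 255] dequantizes to [-1, ~0.992], exactly covering tanh's output range. A standalone check of that dequantization:

    #include <iostream>

    int main()
    {
        const float scale  = 1.f / 128.f; // constants taken from the validator below
        const int   offset = 128;
        for (int q : {0, 128, 255})
        {
            std::cout << "q=" << q << " -> " << (q - offset) * scale << '\n';
        }
        // q=0 -> -1, q=128 -> 0, q=255 -> 0.992188
    }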
dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0), - "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && + (quantized_supported_activations.count(f_act) == 0), + "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and " + "lower/upper bounded relu are supported"); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, -128))); // Checks performed when destination is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -95,15 +106,18 @@ ClActivationKernel::ClActivationKernel() _type = CLKernelType::ELEMENTWISE; } -void ClActivationKernel::configure(const ClCompileContext &compile_context, 
ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info) +void ClActivationKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _run_in_place = (dst == nullptr) || (dst == src); - if(dst != nullptr) + if (dst != nullptr) { // Destination auto initialization if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -119,11 +133,10 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); const bool is_quantized = is_data_type_quantized(dt); - const bool perform_activation_in_float = - (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - || (f_act == ActivationLayerInfo::ActivationFunction::TANH) - || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); + const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) || + (f_act == ActivationLayerInfo::ActivationFunction::TANH) || + (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) || + (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); // Set build options CLBuildOptions build_opts; @@ -132,22 +145,23 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); std::string kernel_name = std::string("activation_layer"); // Set quantization info build options - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - if(!perform_activation_in_float) + if (!perform_activation_in_float) { int a_const_int = 0; int b_const_int = 0; // Create quantized version of constants a, b if needed - switch(dt) + switch (dt) { case DataType::QASYMM8: { @@ -180,22 +194,25 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen } // Quantized value of 0 corresponds to the offset o1 - build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); + build_opts.add_option( + ("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); // Set correct kernel name kernel_name += perform_activation_in_float ? 
std::string("_quant_f32") : std::string("_quant"); // Set scale and offset of the source and destination if they have different quantization info - if(dst != nullptr) + if (dst != nullptr) { const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - if(iq_info != oq_info) + if (iq_info != oq_info) { build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); } } } @@ -235,8 +252,9 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -246,13 +264,12 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl { unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, dst, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h index 82e35b6104..ab7607bb82 100644 --- a/src/gpu/cl/kernels/ClActivationKernel.h +++ b/src/gpu/cl/kernels/ClActivationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -51,7 +52,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] act_info Activation layer information. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + ActivationLayerInfo act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivationKernel::configure() @@ -64,7 +68,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - bool _run_in_place{ false }; + bool _run_in_place{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp index 3d8ecf1fcc..a853f6bc1b 100644 --- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -66,12 +66,15 @@ ClBatchConcatenateKernel::ClBatchConcatenateKernel() _type = CLKernelType::ELEMENTWISE; } -void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) +void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int batch_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _batch_offset = batch_offset; @@ -81,8 +84,9 @@ void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -136,8 +140,9 @@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_3D(); @@ -152,9 
+157,8 @@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } -} // namespace opencl } // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h index f6b7c0ed09..549576b628 100644 --- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h @@ -53,7 +53,8 @@ public: * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClBatchConcatenateKernel::configure() @@ -66,7 +67,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - unsigned int _batch_offset{ 0 }; + unsigned int _batch_offset{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp index f621ad62d7..9ca35634f4 100644 --- a/src/gpu/cl/kernels/ClCastKernel.cpp +++ b/src/gpu/cl/kernels/ClCastKernel.cpp @@ -32,10 +32,10 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -52,20 +52,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32, DataType::S64, DataType::U64); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32, DataType::S64, DataType::U64); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -79,7 +76,10 @@ ClCastKernel::ClCastKernel() _type = CLKernelType::ELEMENTWISE; } 
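The ClCastKernel::configure() hunk just below keeps the comment that conversions from a float source always enable -DSATURATE, because out-of-range float-to-integer conversion is implementation defined. A minimal host-side sketch of the same saturating behaviour, assuming plain C++17 (the helper name is illustrative, not part of the library):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: clamp into the destination range first, so the final
// conversion is always in range and therefore well defined. (NaN handling is
// left aside in this sketch.)
static int8_t saturate_f32_to_s8(float v)
{
    const float lo = static_cast<float>(INT8_MIN); // -128
    const float hi = static_cast<float>(INT8_MAX); //  127
    return static_cast<int8_t>(std::lround(std::clamp(v, lo, hi)));
}
// e.g. saturate_f32_to_s8(300.0f) yields 127, whereas a raw
// static_cast<int8_t>(300.0f) has no defined result.
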
-void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCastKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -88,7 +88,7 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Get data sizes const size_t src_size = data_size_from_type(src->data_type()); @@ -100,12 +100,14 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen // Set build options CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE"); - build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT"); + build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), + "-DIS_DATA_TYPE_FLOAT"); build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED"); // Create kernel @@ -148,8 +150,9 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -162,8 +165,7 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h index a021b3c78c..07b0b61443 100644 --- a/src/gpu/cl/kernels/ClCastKernel.h +++ b/src/gpu/cl/kernels/ClCastKernel.h @@ -64,7 +64,8 @@ public: * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. 
* @param[in] policy Conversion policy */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCastKernel::configure() diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp index 3316742912..9972e07f05 100644 --- a/src/gpu/cl/kernels/ClCol2ImKernel.cpp +++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,29 +48,38 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, + "Col2Im output's data layout must always be NCHW"); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW)); + auto_init_if_empty(*dst, src->clone() + ->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)) + .set_data_layout(DataLayout::NCHW)); constexpr unsigned int num_elems_read_per_iteration = 8; @@ -80,18 +90,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration); bool window_changed = update_window_and_padding(win, input_access); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -ClCol2ImKernel::ClCol2ImKernel() - : _convolved_dims() +ClCol2ImKernel::ClCol2ImKernel() : _convolved_dims() { _type = CLKernelType::ELEMENTWISE; } -void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +void ClCol2ImKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -132,11 +146,15 @@ void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorI _config_id += support::cpp11::to_string(dst->dimension(1)); } -Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +Status ClCol2ImKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); return Status{}; } @@ -168,8 +186,7 @@ void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm add_3D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); + } while (collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h index e19b7c8e16..34194aba01 100644 --- a/src/gpu/cl/kernels/ClCol2ImKernel.h +++ b/src/gpu/cl/kernels/ClCol2ImKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COL2IM_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -68,14 +69,19 @@ public: * @param[in] convolved_dims Output convolved dimensions. 
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClCol2ImKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp index 716dec1f30..85d3c3939c 100644 --- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,17 +46,21 @@ ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel() _type = CLKernelType::ELEMENTWISE; } -void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) +void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialisation if not yet initialized auto_init_if_empty(*dst, *src->clone()); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; @@ -85,8 +90,10 @@ void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &com ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) +Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); @@ -96,7 +103,7 @@ Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, co ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -110,8 +117,9 @@ void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h index 16000e82f6..0ddb54561a 100644 --- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h @@ -55,14 +55,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp index 4719448819..c80ef664f5 100644 --- a/src/gpu/cl/kernels/ClCopyKernel.cpp +++ b/src/gpu/cl/kernels/ClCopyKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,11 +51,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); // Validate dst if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - if(dst_window == nullptr) + if (dst_window == nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); } @@ -74,12 +75,15 @@ ClCopyKernel::ClCopyKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window) +void ClCopyKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel CLBuildOptions build_opts; @@ -93,7 +97,7 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen const Window win_config = calculate_max_window(*src, Steps(vec_size_x)); - if(dst_window != nullptr) + if (dst_window != nullptr) { _has_dst_window = true; _dst_window = Window(*dst_window); @@ -101,9 +105,11 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen const int vec_size_x_leftover = width_x % vec_size_x; const bool multi_access_x = width_x >= static_cast<int32_t>(vec_size_x); - if(multi_access_x) + if (multi_access_x) { - _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); + _dst_window.set(Window::DimX, + Window::Dimension(dst_window->x().start(), + ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); } build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); @@ -127,7 +133,8 @@ void 
ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) +Status +ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window)); @@ -139,12 +146,13 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice; - if(_has_dst_window) + if (_has_dst_window) { slice = window.first_slice_window_3D(); Window out_slice = _dst_window.first_slice_window_3D(); @@ -154,8 +162,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, out_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); + } while (window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); } else { @@ -167,8 +174,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h index 63fd806586..f915bf672d 100644 --- a/src/gpu/cl/kernels/ClCopyKernel.h +++ b/src/gpu/cl/kernels/ClCopyKernel.h @@ -47,7 +47,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCopyKernel::configure() diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp index 87ad6b49d9..0c503e13fc 100644 --- a/src/gpu/cl/kernels/ClCropKernel.cpp +++ b/src/gpu/cl/kernels/ClCropKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -46,8 +47,14 @@ ClCropKernel::ClCropKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, - float extrapolation_value, Window *dst_window) +void ClCropKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window)); @@ -60,7 +67,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen // Create and update the window (if needed) Window win = calculate_max_window(*dst); - if(dst_window != nullptr) + if (dst_window != nullptr) { ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window); win = *dst_window; @@ -70,7 +77,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen const bool multi_access_x = dst_width_x >= vec_size_x; const bool remainder_x = dst_width_x % vec_size_x > 0; - if(multi_access_x) + if (multi_access_x) { win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); @@ -81,13 +88,21 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0))); build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED="); build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED="); _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options()); } -Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status ClCropKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_UNUSED(extrapolation_value, dst_window); 
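The crop, copy, cast and dequantize hunks in this patch all derive their vectorisation build options from the same two quantities seen above; a short sketch, assuming plain C++ (the variable names are illustrative, not the library's):

#include <algorithm>
#include <cstdio>

int main()
{
    const int width    = 19; // elements along X, e.g. dst_width_x above
    const int vec_size = 4;  // elements consumed per vector access

    // VEC_SIZE_LEFTOVER: elements that do not fill a whole vector.
    const int leftover = width % vec_size; // 19 % 4 == 3

    // LAST_ACCESSED_X: start of the last full vector access, clamped at zero,
    // so the tail access is shifted back instead of reading past the end.
    const int last_accessed_x = std::max(width - vec_size, 0); // 15

    std::printf("VEC_SIZE_LEFTOVER=%d LAST_ACCESSED_X=%d\n", leftover, last_accessed_x);
    return 0;
}
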
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); @@ -95,14 +110,15 @@ Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Co ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0); - ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) - || end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON( + start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) || + end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3)); - if(dst_window != nullptr) + if (dst_window != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1); } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -116,12 +132,15 @@ void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window in_slice = Window(); in_slice.use_tensor_dimensions(src->info()->tensor_shape()); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step())); + in_slice.set(Window::DimX, + Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), + window.x().step())); in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1)); unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h index 2f166e184c..506262608c 100644 --- a/src/gpu/cl/kernels/ClCropKernel.h +++ b/src/gpu/cl/kernels/ClCropKernel.h @@ -53,16 +53,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCropKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp index a05cd1321e..ec44d88f01 100644 --- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -48,7 +48,8 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); @@ -60,18 +61,20 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con } } // namespace -ClDepthConcatenateKernel::ClDepthConcatenateKernel() - : _depth_offset(0) +ClDepthConcatenateKernel::ClDepthConcatenateKernel() : _depth_offset(0) { _type = CLKernelType::ELEMENTWISE; } -void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) +void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int depth_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _depth_offset = depth_offset; @@ -81,8 +84,9 @@ void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - 
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -122,8 +126,9 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_3D(); @@ -138,8 +143,7 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h index 4739677f3b..539f010303 100644 --- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h @@ -53,7 +53,8 @@ public: * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. 
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClDepthConcatenateKernel::configure() diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp index 756cd56a8b..53429ab1aa 100644 --- a/src/gpu/cl/kernels/ClDequantizeKernel.cpp +++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp @@ -34,7 +34,6 @@ #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -49,9 +48,11 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, + DataType::QSYMM16); - if(dst->tensor_shape().total_size() > 0) + if (dst->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); @@ -74,7 +75,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen // Output tensor auto initialization if not yet initialized auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); @@ -87,7 +88,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen // Create kernel CLBuildOptions build_opts; - if(!is_quantized_per_channel) + if (!is_quantized_per_channel) { const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? 
qinfo.offset : 0; @@ -103,16 +104,18 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max<int>(output_width_x - vec_size_x, 0))); // Create kernel name _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst); - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -136,10 +139,11 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl:: const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type()); // Collapse window - Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3); + Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) + : window.collapse_if_possible(ICLKernel::window(), 3); Window slice = new_window.first_slice_window_3D(); - if(is_quantized_per_channel) + if (is_quantized_per_channel) { unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters _kernel.setArg(idx++, src->quantization().scale->cl_buffer()); @@ -151,8 +155,7 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl:: add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(new_window.slide_window_slice_3D(slice)); + } while (new_window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index 7ad398412a..7cf1958c1b 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -23,17 +23,18 @@ */ #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" @@ -51,11 +52,17 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - 
const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); const DataLayout data_layout = src->data_layout(); @@ -63,41 +70,56 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, "Export to CLImage is not supported for the input tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, "Export to CLImage is not supported for the output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, + "Export to CLImage is not supported for the input tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, + "Export to CLImage is not supported for the output tensor"); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) && std::get<0>(conv_info.stride()) > 2, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), + "Weights should have same width and height"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, + "Strides larger than 3 not supported for 1x1 convolution."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || + weights->dimension(width_idx) == 9) && + std::get<0>(conv_info.stride()) > 2, "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled(), "Fused activation is not supported for NCHW layout"); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && 
weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && + weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, + "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, - "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && + weights->dimension(width_idx) != 5, + "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); } } - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), "Fused activation in NHWC is only supported for floating point."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), + "Fused activation in NHWC is only supported for floating point."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); - if(desc.export_weights_to_cl_image) + if (desc.export_weights_to_cl_image) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16"); @@ -106,9 +128,9 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -118,20 +140,19 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + 
dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } const auto data_type = src->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -140,7 +161,8 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); } return Status{}; } @@ -151,8 +173,14 @@ ClDirectConv2dKernel::ClDirectConv2dKernel() { _type = CLKernelType::DIRECT; } -void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -178,14 +206,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); // Configure kernel window Window win; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { output_shape.collapse(2U, 1U); const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]); @@ -194,7 +219,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT // Create window and update padding win = calculate_max_window(output_shape, Steps(n0, m0)); } - else if(_data_layout == DataLayout::NCHW) + else if (_data_layout == DataLayout::NCHW) { _num_elems_processed_per_iteration = 1u; win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); @@ -205,7 +230,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT std::stringstream kernel_name; CLBuildOptions build_options; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { kernel_name << "direct_convolution_nhwc"; @@ -221,22 +246,22 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _export_output_to_cl_image = desc.export_output_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { gemm::update_padding_for_cl_image(weights); } - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { gemm::update_padding_for_cl_image(dst); } - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) {
gemm::update_padding_for_cl_image(src); } - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -246,9 +271,10 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT const auto act_function = act_info.activation(); const auto dst_data_type = dst->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -259,7 +285,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-cl-fast-relaxed-math"); } - build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", + "-DSRC_TENSOR_TYPE=BUFFER"); build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0))); build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1))); @@ -267,9 +294,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0))); build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1))); build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2))); - build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", "-DDST_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", + "-DDST_TENSOR_TYPE=BUFFER"); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type)); - build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", + "-DWEI_TENSOR_TYPE=BUFFER"); build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); @@ -284,7 +313,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP"); build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function))); - if(is_data_type_quantized(data_type)) + if 
(is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -314,11 +343,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + build_options.add_option_if(act_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_options.add_option_if(act_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); } - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -340,13 +371,17 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + build_options.add_option( + std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); - build_options.add_option(std::string("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); + build_options.add_option( + std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); + build_options.add_option( + std::string("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -405,8 +440,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status ClDirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, 
desc)); return Status{}; @@ -420,52 +460,55 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { cl::Image2D weights_cl_image; cl::Image2D output_cl_image; cl::Image2D input_cl_image; - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { // Export tensor to cl_image weights_cl_image = create_image2d_from_tensor(weights, CLImage2DType::ReadOnly); } - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { // Export tensor to cl_image output_cl_image = create_image2d_from_tensor(dst, CLImage2DType::WriteOnly); } - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { // Export tensor to cl_image input_cl_image = create_image2d_from_tensor(src, CLImage2DType::ReadOnly); } unsigned int idx = 0; - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { _kernel.setArg(idx++, input_cl_image); } add_4d_tensor_nhwc_argument(idx, src); - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { _kernel.setArg(idx++, output_cl_image); } add_4d_tensor_nhwc_argument(idx, dst); - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } @@ -476,7 +519,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); add_3D_tensor_argument(idx1, weights, slice); - if(biases != nullptr) + if (biases != nullptr) { Window slice_biases; slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); @@ -491,8 +534,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h index 7132762b35..c934c825ca 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" 
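// A minimal sketch, not the library's exact implementation, of what
// quantization::calculate_quantized_multiplier (called in the quantized
// direct-conv paths above) conceptually computes: the real multiplier
// iqinfo.scale * wqinfo.scale / oqinfo.scale is split into a Q0.31 integer
// multiplier plus a shift, so requantization stays in integer arithmetic.
// Names here are illustrative; the (0, 1) multiplier range is an assumption.
#include <cmath>
#include <cstdint>

bool decompose_multiplier_sketch(double multiplier, int32_t *quant_multiplier, int *shift)
{
    if (multiplier <= 0.0 || multiplier >= 1.0) // sketch assumes a multiplier in (0, 1)
    {
        return false;
    }
    const double q = std::frexp(multiplier, shift); // multiplier = q * 2^shift, q in [0.5, 1)
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if (q_fixed == (1ll << 31)) // rounding carried into 2^31: renormalize
    {
        q_fixed /= 2;
        ++(*shift);
    }
    *quant_multiplier = static_cast<int32_t>(q_fixed);
    // Usage: requantized ~= (int64_t(acc) * *quant_multiplier) >> (31 - *shift)
    return true;
}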
#include "src/gpu/cl/IClKernel.h" @@ -68,16 +69,27 @@ public: * @param[in] act_info Contains activaton information described in @ref ActivationLayerInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC direct convolution kernel. For NCHW, this parameter is ignored. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClDirectConv2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -85,9 +97,9 @@ public: public: DataLayout _data_layout{}; PadStrideInfo _conv_info{}; - bool _export_weights_to_cl_image{ false }; - bool _export_output_to_cl_image{ false }; - bool _export_input_to_cl_image{ false }; + bool _export_weights_to_cl_image{false}; + bool _export_output_to_cl_image{false}; + bool _export_input_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp index 6191178911..8002520a87 100644 --- a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -40,7 +41,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); @@ -49,20 +54,25 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported"); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + 
DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); ARM_COMPUTE_RETURN_ERROR_ON(conv3d_info.dilation != Size3D(1U, 1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 5, "Weights can be at most 5 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > + (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > + (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > + (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back)); - if(src2 != nullptr) + if (src2 != nullptr) { - if(is_data_type_quantized(src0->data_type())) + if (is_data_type_quantized(src0->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32); } @@ -70,15 +80,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), + "Biases size and number of dst feature maps should match"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src1->dimension(0), "Weights and dst OFMs should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -91,8 +104,12 @@ ClDirectConv3dKernel::ClDirectConv3dKernel() _type = CLKernelType::DIRECT; } -void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, - const Conv3dInfo &conv3d_info) +void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); @@ -149,13 +166,13 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); build_options.add_option("-DPARTIAL_N0=" + 
support::cpp11::to_string(partial_store_n0)); - if(src2 != nullptr) + if (src2 != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type()))); } - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform(); @@ -218,7 +235,11 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co _config_id += support::cpp11::to_string(dst_channels); } -Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv3d_info)); return Status{}; @@ -229,21 +250,28 @@ void ClDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Get initial windows Window slice = window.first_slice_window_3D(); - slice.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3), slice.y().step()), slice.y().step())); + slice.set(Window::DimY, Window::Dimension(0, + ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * + dst->info()->dimension(3), + slice.y().step()), + slice.y().step())); slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(4), 1)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); add_4D_tensor_argument(idx, weights, slice); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h index de4f0ce216..cb7509d8fa 100644 --- a/src/gpu/cl/kernels/ClDirectConv3dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h @@ -70,14 +70,23 @@ public: * @param[out] dst Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent batch of dsts. 
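// ClDirectConv3dKernel::run_op above sizes the collapsed DimY extent with
// ceil_to_multiple so the last partial slice of W * H * D elements is still
// enqueued. A sketch of that rounding (the library ships its own helper;
// this function name is illustrative and only shows the arithmetic):
int ceil_to_multiple_sketch(int value, int step) // assumes step > 0
{
    return ((value + step - 1) / step) * step;
}
// e.g. a 5 x 7 x 3 output volume (105 elements) with step 8 rounds up to 112,
// i.e. 14 full steps of 8.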
* @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClDirectConv3dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp index 6beee576b5..cdb3527a92 100644 --- a/src/gpu/cl/kernels/ClElementwiseKernel.cpp +++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp @@ -23,18 +23,20 @@ */ #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/common/utils/Validate.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" + #include <map> namespace arm_compute @@ -47,25 +49,20 @@ namespace { constexpr unsigned int vector_size_byte_opencl = 16; -std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = -{ - { ArithmeticOperation::ADD, "ADD" }, - { ArithmeticOperation::SUB, "SUB" }, - { ArithmeticOperation::DIV, "DIV" }, - { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" }, - { ArithmeticOperation::MIN, "MIN" }, - { ArithmeticOperation::MAX, "MAX" }, - { ArithmeticOperation::POWER, "POWER" }, - { ArithmeticOperation::PRELU, "PRELU" }, +std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = { + {ArithmeticOperation::ADD, "ADD"}, {ArithmeticOperation::SUB, "SUB"}, + {ArithmeticOperation::DIV, "DIV"}, {ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF"}, + {ArithmeticOperation::MIN, "MIN"}, {ArithmeticOperation::MAX, "MAX"}, + {ArithmeticOperation::POWER, "POWER"}, {ArithmeticOperation::PRELU, "PRELU"}, }; -std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = -{ - { ArithmeticOperation::ADD, "ADD" }, - { ArithmeticOperation::SUB, "SUB" }, +std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = { + {ArithmeticOperation::ADD, "ADD"}, + {ArithmeticOperation::SUB, "SUB"}, }; -std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string +generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) { std::string config_id; // Set config_id for 
enabling LWS tuning @@ -79,12 +76,18 @@ std::string generate_id_for_tuning_common(const std::string &kernel_name, const return config_id; } -Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape) +Status validate_in_place_output_shape(const bool in_place, + const bool src1_in_place, + const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst, + const TensorShape &out_shape) { - if(in_place) + if (in_place) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); } else { @@ -94,7 +97,9 @@ Status validate_in_place_output_shape(const bool in_place, const bool src1_in_pl return Status{}; } -Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); @@ -110,11 +115,12 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); } return Status{}; @@ -136,25 +142,27 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); } return Status{}; } -Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +Status +validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, 
DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); - if(is_data_type_quantized_symmetric(src1.data_type())) + if (is_data_type_quantized_symmetric(src1.data_type())) { const int32_t in1_offset = src1.quantization_info().uniform().offset; const int32_t in2_offset = src2.quantization_info().uniform().offset; @@ -170,13 +178,15 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); - if(is_data_type_quantized_symmetric(dst.data_type())) + if (is_data_type_quantized_symmetric(dst.data_type())) { const int32_t offset = dst.quantization_info().uniform().offset; ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero"); @@ -185,19 +195,26 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I return Status{}; } -CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string) +CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst, + const std::string &operation_string) { CLBuildOptions build_opts; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN1=" + + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN2=" + + support::cpp11::to_string(src2.dimension(0) == 1 ? 
1 : num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DOP=" + operation_string); - if(is_data_type_quantized(src1.data_type())) + if (is_data_type_quantized(src1.data_type())) { const UniformQuantizationInfo iq1info = src1.quantization_info().uniform(); const UniformQuantizationInfo iq2info = src2.quantization_info().uniform(); @@ -223,14 +240,17 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &s std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &dst) { - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); - Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); return std::make_pair(Status{}, win); } -std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(dst, out_shape, 1, src1.data_type()); @@ -238,9 +258,11 @@ std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators return configure_window_arithmetic_common(dst); } -std::pair<Status, Window> validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; set_shape_if_empty(dst, out_shape); @@ -249,9 +271,11 @@ std::pair<Status, Window> validate_and_configure_window_for_logical_binary_opera return configure_window_arithmetic_common(dst); } -std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(dst, out_shape, 1, src1.data_type()); @@ -265,21 
+289,24 @@ ClElementwiseKernel::ClElementwiseKernel() _type = CLKernelType::ELEMENTWISE; } -void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { // Configure kernel window auto win_config = validate_and_configure_window(*src1, *src2, *dst); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); std::string kernel_name = "elementwise_operation_" + name(); - if(is_data_type_quantized(src1->data_type())) + if (is_data_type_quantized(src1->data_type())) { kernel_name += "_quantized"; } // Set kernel build options CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst); - if(_act_info.enabled()) + if (_act_info.enabled()) { build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a())); @@ -299,9 +326,11 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); @@ -311,17 +340,18 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -337,7 +367,7 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_src1); add_3D_tensor_argument(idx, src_1, slice_src2); - if(!in_place) + if (!in_place) { add_3D_tensor_argument(idx, dst, slice); } @@ -345,13 +375,16 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c enqueue(queue, *this, slice, lws_hint()); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } /** Logical binary */ -void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, + LogicalOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst)); @@ -359,7 +392,10 @@ void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, L configure_common(compile_context, src1, src2, dst); } -Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +Status ClLogicalBinaryKernel::validate(LogicalOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not); @@ -369,14 +405,16 @@ Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *s ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()) + .first); return Status{}; } std::string ClLogicalBinaryKernel::name() { - switch(_op) + switch (_op) { case LogicalOperation::And: return "AND"; @@ -390,30 +428,38 @@ std::string ClLogicalBinaryKernel::name() return ""; } -std::pair<Status, Window> ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst); } -CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +CLBuildOptions +ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { // The arithmetic utility functions can be shared return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); } -std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &src1, + const
ITensorInfo &dst) { return generate_id_for_tuning_common(kernel_name, src1, dst); } /** Arithmetic operations with saturation*/ -void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, +void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info)); - auto padding_info = get_padding_info({ input1, input2, output }); + auto padding_info = get_padding_info({input1, input2, output}); _policy = policy; _op = op; @@ -422,24 +468,34 @@ void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_cont ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, +Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(op, policy); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()) + .first); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type())); return Status{}; } -std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) +std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, + ITensorInfo &input2, + ITensorInfo &output) { return validate_and_configure_window_for_arithmetic_operators(input1, input2, output); } -CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) { const bool has_float_out = is_data_type_float(output.data_type()); auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name()); @@ -447,7 +503,9 @@ CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensor return build_options; } -std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) +std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &input1, + const ITensorInfo &output) { auto config_id = generate_id_for_tuning_common(kernel_name, input1, output); config_id += (_policy == ConvertPolicy::WRAP) ? 
"_wrap_" : "_saturate_"; @@ -461,12 +519,16 @@ std::string ClSaturatedArithmeticKernel::name() } /** Arithmetic operations*/ -void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, +void ClArithmeticKernel::configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); _op = op; _act_info = act_info; @@ -474,33 +536,42 @@ void ClArithmeticKernel::configure(const ClCompileContext &compile_context, Arit ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - if(op == ArithmeticOperation::DIV) + if (op == ArithmeticOperation::DIV) { // Partial integer support S32/F32/F16 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); } - else if(op == ArithmeticOperation::POWER) + else if (op == ArithmeticOperation::POWER) { // Power operators doesn't support integer arithmetic ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); } else { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()) + .first); } ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); return Status{}; } -std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) + if (_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) { // Division and Power operators don't support integer arithmetic return validate_and_configure_window_for_division(src1, src2, dst); @@ -511,11 +582,14 @@ std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITen } } -CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, 
const ITensorInfo &src2, const ITensorInfo &dst) +CLBuildOptions +ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); } -std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &src1, + const ITensorInfo &dst) { return generate_id_for_tuning_common(kernel_name, src1, dst); } diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h index ea3ddb2124..73e54542b2 100644 --- a/src/gpu/cl/kernels/ClElementwiseKernel.h +++ b/src/gpu/cl/kernels/ClElementwiseKernel.h @@ -25,8 +25,9 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/core/KernelTypes.h" + #include "src/core/common/Macros.h" +#include "src/core/KernelTypes.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -65,24 +66,28 @@ protected: * * @return a pair of Status and Window */ - virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; + virtual std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; /** Generate the build options for the specific kernel * * @return a CLBuildOptions struct */ - virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; + virtual CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; /** Generate the identifier for tuning * * @return a string */ - virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; + virtual std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) * */ - void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + void + configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); ActivationLayerInfo _act_info{}; }; @@ -100,23 +105,31 @@ public: * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
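// Sketch of how the VEC_SIZE_* build options generated by these kernels are
// derived: a 16-byte vector budget (vector_size_byte_opencl) divided by the
// element size, capped at the innermost extent. The library's adjust_vec_size
// may apply a different policy; the cap-at-dim0 behaviour here is an
// assumption, and the function name is illustrative.
#include <algorithm>
#include <cstddef>

unsigned int pick_vec_size_sketch(std::size_t element_size, std::size_t dim0)
{
    const auto budget = static_cast<unsigned int>(16 / element_size); // 16-byte budget
    return std::min<unsigned int>(budget, static_cast<unsigned int>(dim0));
}
// An F32 dst with dim0 == 70 gets VEC_SIZE_OUT=4 and VEC_SIZE_LEFTOVER=2
// (70 % 4); a broadcast input with dimension(0) == 1 gets VEC_SIZE_IN1=1.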
*/ - void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + void configure(const ClCompileContext &compile_context, + LogicalOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogicalBinaryKernel::configure() * * @return a status */ - static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); + static Status + validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); private: // Inherited methods overridden: std::string name() override; - std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; - - LogicalOperation _op{ LogicalOperation::Unknown }; + std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; + CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; + std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; + + LogicalOperation _op{LogicalOperation::Unknown}; }; /** Addition operation */ @@ -135,7 +148,12 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
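// Sketch of what the ConvertPolicy parameter documented above means for the
// saturated arithmetic kernel, shown on an illustrative 8-bit add (the name
// add_u8_sketch is hypothetical, not a library function):
#include <algorithm>
#include <cstdint>

uint8_t add_u8_sketch(uint8_t a, uint8_t b, bool saturate)
{
    const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
    return saturate ? static_cast<uint8_t>(std::min<int32_t>(sum, 255)) // SATURATE clamps at the type's max
                    : static_cast<uint8_t>(sum);                        // WRAP keeps the low 8 bits (mod 256)
}
// add_u8_sketch(200, 100, true) == 255, add_u8_sketch(200, 100, false) == 44.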
*/ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, + void configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration @@ -144,15 +162,23 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, + static Status validate(ArithmeticOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); protected: // Inherited methods overridden: std::string name() override; - std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; - CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override; + std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; + CLBuildOptions + generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; + std::string generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &input1, + const ITensorInfo &output) override; private: ConvertPolicy _policy{}; @@ -174,7 +200,11 @@ public: * @param[in] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + void configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration @@ -183,14 +213,21 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(ArithmeticOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); protected: // Inherited methods overridden: std::string name() override; - std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; + std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; + CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; + std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; private: ArithmeticOperation _op{}; diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp index 744a3a40c7..f7c198ee54 100644 --- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp +++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp @@ -23,11 +23,12 @@ */ #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -46,17 +47,18 @@ constexpr unsigned int vector_size_byte_opencl = 16; Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - if(op == ElementWiseUnary::LOGICAL_NOT) + if (op == ElementWiseUnary::LOGICAL_NOT) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8); } - else if(op == ElementWiseUnary::NEG) + else if (op == ElementWiseUnary::NEG) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); } - else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT. + else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT. 
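[Note] The declarations reflowed above keep the header's validate-then-configure contract: validate() is static and allocation-free, so a caller can reject an invalid operator before building any kernel. A minimal usage sketch, under assumed shapes and types (only ClLogicalBinaryKernel is named in this hunk):

    // Sketch only: U8 inputs of identical shape, checked before any configure() call.
    using namespace arm_compute;
    TensorInfo a(TensorShape(32U, 8U), 1, DataType::U8);
    TensorInfo b(a);
    TensorInfo out(a);
    const Status st = opencl::kernels::ClLogicalBinaryKernel::validate(LogicalOperation::And, &a, &b, &out);
    if (st.error_code() == ErrorCode::OK)
    {
        // configure() may now be called with the same arguments plus a compile context.
    }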
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
index 744a3a40c7..f7c198ee54 100644
--- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
@@ -23,11 +23,12 @@
  */
 #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
 
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -46,17 +47,18 @@ constexpr unsigned int vector_size_byte_opencl = 16;
 Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
-    if(op == ElementWiseUnary::LOGICAL_NOT)
+    if (op == ElementWiseUnary::LOGICAL_NOT)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8);
     }
-    else if(op == ElementWiseUnary::NEG)
+    else if (op == ElementWiseUnary::NEG)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
     }
-    else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
+    else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+                                                             DataType::QASYMM8_SIGNED);
     }
     else
     {
@@ -64,7 +66,7 @@ Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const
     }
 
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
@@ -80,19 +82,23 @@ ClElementWiseUnaryKernel::ClElementWiseUnaryKernel()
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op)
+void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context,
+                                         const ITensorInfo *src,
+                                         ITensorInfo *dst,
+                                         const ElementWiseUnary &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op));
 
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
 
-    std::string kernel_name = "elementwise_unary";
-    const int   vec_size_x  = num_elems_processed_per_iteration;
-    const int   dst_width_x = dst->dimension(0);
-    if(is_data_type_quantized(src->data_type()))
+    std::string kernel_name = "elementwise_unary";
+    const int vec_size_x = num_elems_processed_per_iteration;
+    const int dst_width_x = dst->dimension(0);
+    if (is_data_type_quantized(src->data_type()))
     {
         kernel_name += "_quantized";
     }
@@ -101,7 +107,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
-    if(is_data_type_quantized(src->data_type()))
+    if (is_data_type_quantized(src->data_type()))
     {
         const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
         const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
@@ -110,7 +116,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context
         build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale));
         build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
     }
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::RSQRT:
             build_opts.add_option("-DOPERATION=rsqrt_op");
@@ -169,8 +175,9 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     do
     {
@@ -178,8 +185,7 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
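[Note] The configure() body reflowed above encodes the kernel's vectorization scheme: a 16-byte access (vector_size_byte_opencl) clamped to the row width, with LAST_ACCESSED_X telling the OpenCL code where the final, possibly overlapping, vector access must start. A standalone sketch of the same arithmetic (the plain clamp is an assumption; the library's adjust_vec_size also rounds to a supported OpenCL vector width):

    #include <algorithm>

    // For F32 (4-byte) elements and a row of width 10:
    // vec = min(16 / 4, 10) = 4 and last_x = max(10 - 4, 0) = 6, so the last
    // vector load covers elements 6..9 and overlaps the one before it.
    int vec_size(int element_size_bytes, int row_width)
    {
        return std::min(16 / element_size_bytes, row_width);
    }
    int last_accessed_x(int row_width, int vec)
    {
        return std::max(row_width - vec, 0);
    }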
remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); _kernel = create_kernel(compile_context, "memset", build_opts.options()); } @@ -89,7 +94,7 @@ Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &const { ARM_COMPUTE_UNUSED(tensor); ARM_COMPUTE_UNUSED(constant_value); - if(window != nullptr) + if (window != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); } @@ -101,7 +106,8 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto tensor = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); // Collapse all the batches on the third Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); @@ -112,8 +118,7 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman unsigned int idx = 0; add_3D_tensor_argument(idx, tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h index f25cf928ad..5d69fbfbd1 100644 --- a/src/gpu/cl/kernels/ClFillKernel.h +++ b/src/gpu/cl/kernels/ClFillKernel.h @@ -47,7 +47,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClFillKernel::configure() diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp index f9f834875a..358e84012b 100644 --- a/src/gpu/cl/kernels/ClFloorKernel.cpp +++ b/src/gpu/cl/kernels/ClFloorKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -52,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); // Validate in case of configured output - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -76,9 +77,9 @@ void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITe // Validate ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); @@ -105,8 +106,9 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -117,8 +119,7 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp index accafeecc2..e0d925dfb2 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/CL/OpenCL.h" #include 
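[Note] ClFloorKernel handles the row tail differently from the fill kernel above: instead of shifting the last vector access, it passes the remainder count to the kernel, computed as the row width modulo the vector size. A hedged numeric example (assuming max_cl_vector_width is 16, as its use as a byte budget in the hunk suggests):

    // Illustrative values only.
    const int max_cl_vector_width = 16;
    const int element_size = 2;                                  // F16
    const int width        = 20;                                 // src->dimension(0)
    const int vec_size_x   = max_cl_vector_width / element_size; // 8 lanes requested
    const int leftovers    = width % vec_size_x;                 // 4 elements in the partial tail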
"arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -50,26 +49,35 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) + if (src0->data_type() == DataType::QASYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m(); @@ -83,7 +91,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n)); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k)); - if(gemm_info.reinterpret_input_as_3d()) + if (gemm_info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); } @@ -92,9 +100,10 @@ Status validate_arguments(const ITensorInfo *src0, 
const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -102,8 +111,13 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; @@ -115,17 +129,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_dst_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_dst_as_3d) + if (reinterpret_input_as_3d == reinterpret_dst_as_3d) { reinterpret_dst_as_3d = false; } // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + auto_init_if_empty(*dst, src0->clone() + ->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)) + .set_data_type(DataType::S32)); TensorInfo tmp_info(*dst); - if(reinterpret_dst_as_3d) + if (reinterpret_dst_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -138,12 +154,12 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // RHS matrix still needs padding on the X - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); + AccessWindowStatic src1_access( + src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop @@ -153,7 +169,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, 
dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -163,8 +180,13 @@ ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); @@ -175,11 +197,11 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ src0, dst }); + auto padding_info = get_padding_info({src0, dst}); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -192,7 +214,8 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = + validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -212,8 +235,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); @@ -258,19 +283,19 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com 
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -280,11 +305,13 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -296,7 +323,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; @@ -304,10 +331,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -317,7 +344,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -330,8 +357,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h index 4b328e0ab8..4f87096158 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -55,25 +56,34 @@ public: * rhs_info.k0: same as lhs_info.k0 * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp index 15493f7ddc..ddbc809cdd 100644 --- 
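[Note] The k0/n0 assertions reflowed in this file use a bit trick: x & (x - 1) is zero exactly when x is a power of two, with 3 whitelisted as the single odd block size. A self-contained restatement of the predicate (the helper name is ours; the kernel enforces the same set through separate macros and range checks):

    // Accepts the block sizes the error message names: 2, 3, 4, 8 or 16.
    // (1 also passes the bit test, but the surrounding range checks exclude it.)
    bool is_supported_block_size(unsigned int x)
    {
        const bool pow2_or_3 = ((x & (x - 1)) == 0) || (x == 3);
        return pow2_or_3 && x >= 2 && x <= 16;
    }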
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
index 15493f7ddc..ddbc809cdd 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
@@ -29,13 +29,12 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
@@ -51,45 +50,55 @@ namespace
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst,
-                          const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status validate_arguments(const ITensorInfo *src0,
+                          const ITensorInfo *src1,
+                          const ITensorInfo *dst,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
     ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m();
     const int n = gemm_info.n();
     const int k = gemm_info.k();
 
-    TensorShape tensor_shape0{ src0->tensor_shape() };
+    TensorShape tensor_shape0{src0->tensor_shape()};
     tensor_shape0.set(0, k);
     tensor_shape0.set(1, m);
 
-    TensorShape tensor_shape1{ src1->tensor_shape() };
+    TensorShape tensor_shape1{src1->tensor_shape()};
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
     const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
     const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
 
-    const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
-    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+    const TensorInfo tensor_info_reshaped0 =
+        src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
@@ -99,19 +108,24 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                                                        const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info,
-                                                        ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+                                                        const ITensorInfo *src1,
+                                                        ITensorInfo *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info,
+                                                        ElementsProcessed &num_elements_processed)
 {
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
     bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -123,7 +137,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
     // Configure kernel window
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
-    Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
@@ -140,8 +155,13 @@ ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel()
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                                                       const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
+                                                       const ITensorInfo *src0,
+                                                       const ITensorInfo *src1,
+                                                       ITensorInfo *dst,
+                                                       const GEMMLHSMatrixInfo &lhs_info,
+                                                       const GEMMRHSMatrixInfo &rhs_info,
+                                                       const GEMMReshapeInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
@@ -154,11 +174,12 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
     const unsigned int num_dimensionssrc0 = src0->num_dimensions();
     _slide_matrix_b                       = (src1->num_dimensions() >= num_dimensionssrc0);
 
-    auto padding_info = get_padding_info({ src0, src1, dst });
+    auto padding_info = get_padding_info({src0, src1, dst});
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    auto win_config =
+        validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -171,8 +192,10 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
     build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
@@ -230,19 +253,19 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
-                                                        const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0,
+                                                        const ITensorInfo *src1,
+                                                        const ITensorInfo *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info)
 {
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
-                                                              src1->clone().get(),
-                                                              dst->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
                                                               dst->clone().get(), lhs_info, rhs_info, gemm_info,
                                                               num_elements_processed)
-                                .first);
+                                    .first);
 
     return Status{};
 }
@@ -252,11 +275,13 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -268,7 +293,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_output_as_3d)
+    if (_reinterpret_output_as_3d)
     {
         // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
         const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
@@ -281,7 +306,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
+        if (!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -295,8 +320,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
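[Note] Both GEMM-lowp kernels in this patch compute their execution window on a 2D view whenever the destination is reinterpreted as 3D: height and depth are folded into a single M dimension before calculate_max_window() runs on tmp_info. A sketch of that collapse (the helper is hypothetical; the library's exact call is not visible in these hunks):

    // Fold (W, H, D, ...) into (W, H * D, 1, ...) so a 2D GEMM window applies.
    arm_compute::TensorShape collapse_for_2d_gemm(arm_compute::TensorShape s)
    {
        s.set(1, s[1] * s[2]); // fold depth into the M dimension
        s.set(2, 1);
        return s;
    }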
+ const GEMMReshapeInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - unsigned int _k{ 1 }; - bool _use_dummy_work_items{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_output_as_3d{false}; + unsigned int _k{1}; + bool _use_dummy_work_items{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp index 5d552b8d63..2f1f3b8df0 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -54,45 +53,57 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) + if (src0->data_type() == DataType::QASYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m; const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); } @@ -103,11 +114,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) + if (output_stage.type == GEMMLowpOutputStageType::NONE) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -117,39 +128,42 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || + (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, 
vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (expected_dst_shape[1] * expected_dst_shape[2])); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_dst_shape.num_dimensions() > 1) + if (expected_dst_shape.num_dimensions() > 1) { const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -161,30 +175,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], "vector_sum_row must have the same number of batches of dst tensor"); - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - if(output_multipliers != nullptr && output_shifts != nullptr) + if (output_multipliers != nullptr && output_shifts != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); @@ -194,9 +210,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + 
const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts, + ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -211,16 +234,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + if (reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } // dst tensor auto initialization if not yet initialized const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) + if (output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { @@ -229,7 +253,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -242,12 +266,14 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); @@ -255,17 +281,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, // No access window needed for vector_sum_row ARM_COMPUTE_UNUSED(vector_sum_row); - if(bias != nullptr) + if (bias != nullptr) { AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win_out, bias_access); } - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + if (output_multipliers != nullptr && output_stage.is_quantized_per_channel) { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, + num_elems_processed_per_iteration_x); AccessWindowHorizontal 
output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); + window_changed = + window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); } } @@ -275,7 +303,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -285,15 +314,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedO _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row}); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -307,7 +343,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
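To make the role of the vector_sum_col / vector_sum_row arguments threaded through configure() concrete: the fused output stage must fold the quantization offsets back into the raw S32 accumulators, and the K_OFFSET build option used by the offset-contribution kernels further down is the a_offset * b_offset * K term of the same algebraic expansion. Below is a minimal self-contained C++ sketch of that identity — toy names only, not ACL API; whether a_offset/b_offset are the zero points or their negatives is a sign convention folded in by the calling layer.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        // Toy 1-D dot product standing in for one (x, y) cell of the GEMM.
        const std::vector<int32_t> a{3, -1, 7, 2};   // one row of the LHS
        const std::vector<int32_t> b{5, 4, -2, 6};   // one column of the RHS
        const int32_t a_off = -128, b_off = 3;       // quantization offsets (sign convention folded in)
        const int32_t K = static_cast<int32_t>(a.size());

        int32_t raw = 0, row_sum = 0, col_sum = 0, full = 0;
        for (int32_t k = 0; k < K; ++k)
        {
            raw += a[k] * b[k];                      // what the integer GEMM actually computes
            row_sum += a[k];                         // one element of vector_sum_row (LHS row sums)
            col_sum += b[k];                         // one element of vector_sum_col (RHS column sums)
            full += (a[k] + a_off) * (b[k] + b_off); // the offset-corrected result we want
        }

        // Offset contribution: raw + a_off * colsum(B) + b_off * rowsum(A) + a_off * b_off * K.
        // The last term is what the kernels pass as K_OFFSET.
        assert(full == raw + a_off * col_sum + b_off * row_sum + a_off * b_off * K);
        return 0;
    }

This is why the validation above insists that vector_sum_col match dst's N dimension and vector_sum_row match its M dimension: each holds exactly one pre-reduced sum per output column or row.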
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -320,7 +356,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -341,8 +378,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); @@ -361,12 +400,12 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { kernel_name += "_fused_output_stage_fixedpoint"; _fuse_output_stage = true; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) + if (a_offset != 0 && vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -377,9 +416,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); // In case of _is_quantized_per_channel, RESULT_MULTIPLIER and RESULT_SHIFT are not utilized, but they are passed as a part of T_QUANTIZE8 macro. 
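For reference, the arithmetic that the RESULT_MULTIPLIER / RESULT_SHIFT / RESULT_OFFSET options feed follows the widely used gemmlowp fixed-point formulation, with the clamp bounds corresponding to gemmlowp_min_bound / gemmlowp_max_bound. The scalar sketch below is a hedged reference only — it assumes a non-negative right shift and a QASYMM8 destination; the CL kernel source remains the authoritative version.

    #include <algorithm>
    #include <cstdint>

    // High 32 bits of 2*a*b with rounding, saturating the single overflow case
    // (a == b == INT32_MIN). The ">> 31" is an arithmetic shift on int64_t.
    static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == b && a == INT32_MIN)
        {
            return INT32_MAX;
        }
        const int64_t ab    = static_cast<int64_t>(a) * b;
        const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
        return static_cast<int32_t>((ab + nudge) >> 31);
    }

    // Divide by 2^exponent with rounding to nearest; exponent in [0, 31).
    static int32_t rounding_divide_by_pot(int32_t x, int exponent)
    {
        const int32_t mask      = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    // One S32 accumulator through the fused fixed-point output stage.
    static uint8_t quantize_down_fixedpoint(int32_t acc,
                                            int32_t result_multiplier, // RESULT_MULTIPLIER
                                            int32_t result_shift,      // RESULT_SHIFT (>= 0 here)
                                            int32_t result_offset)     // RESULT_OFFSET
    {
        int32_t v = rounding_doubling_high_mul(acc, result_multiplier);
        v = rounding_divide_by_pot(v, result_shift);
        v += result_offset;                // destination zero point
        v = std::min(std::max(v, 0), 255); // gemmlowp_min_bound / gemmlowp_max_bound for QASYMM8
        return static_cast<uint8_t>(v);
    }

    int main()
    {
        // Multiplier 1 << 30 encodes an effective scale of 0.5 (M / 2^31),
        // so the accumulator 100 maps to 100 * 0.5 + 10 = 60.
        const uint8_t q = quantize_down_fixedpoint(100, 1 << 30, 0, 10);
        return q == 60 ? 0 : 1;
    }

In the per-channel case the scalar multiplier/shift pair is simply replaced by the output_multipliers / output_shifts tensors indexed by output channel, which is why the kernel skips the RESULT_MULTIPLIER / RESULT_SHIFT defines there.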
- if(!_is_quantized_per_channel) + if (!_is_quantized_per_channel) { - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); + build_opts.add_option("-DRESULT_MULTIPLIER=" + + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); } else @@ -432,42 +472,56 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); return Status{}; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -479,7 +533,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; @@ -487,10 +541,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned 
int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -515,7 +569,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -527,19 +581,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor idx++; } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor idx++; } - if(_fuse_output_stage) + if (_fuse_output_stage) { add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); @@ -548,8 +602,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); } enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h index a77604db7c..1d4696b089 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -70,31 +71,44 @@ public: * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). * Supported data types: S32. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, + ITensorInfo *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _is_quantized_per_channel{ false }; - bool _fuse_output_stage{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _is_quantized_per_channel{false}; + bool _fuse_output_stage{false}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 792c71da76..030c11d069 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -23,16 +23,15 @@ */ #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" namespace arm_compute @@ -47,39 +46,51 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), "Only 1,2,4 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), "Only 1,4,8 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), + "Only 1,2,4 are supported for m0"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), + "Only 1,4,8 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m; const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); } @@ -90,11 +101,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) + if (output_stage.type == GEMMLowpOutputStageType::NONE) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -104,38 +115,41 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || + (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + expected_dst_shape.num_dimensions() > 1 && 
expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (expected_dst_shape[1] * expected_dst_shape[2])); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_dst_shape.num_dimensions() > 1) + if (expected_dst_shape.num_dimensions() > 1) { const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -147,30 +161,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], "vector_sum_row must have the same number of batches of dst tensor"); - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - if(output_multipliers != nullptr && output_shifts != nullptr) + if (output_multipliers != nullptr && output_shifts != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); @@ -180,9 +196,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts, + ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -200,9 +223,10 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, reinterpret_output_as_3d = false; // dst 
tensor auto initialization if not yet initialized const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) + if (output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { @@ -211,7 +235,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -224,11 +248,12 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = 1; num_elems_processed_per_iteration_y = 1; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access); @@ -236,17 +261,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, // No access window needed for vector_sum_row ARM_COMPUTE_UNUSED(vector_sum_row); - if(bias != nullptr) + if (bias != nullptr) { AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win, bias_access); } - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + if (output_multipliers != nullptr && output_stage.is_quantized_per_channel) { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, + num_elems_processed_per_iteration_x); AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); + window_changed = + window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); } } @@ -278,7 +305,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, collapsed.set(Window::DimX, x_dimension); collapsed.set(Window::DimY, y_dimension); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -288,15 +316,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmLowpMatrixMultiplyResha _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row}); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -313,7 +348,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -334,18 +370,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul"); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT"); _fuse_output_stage = true; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) + if (a_offset != 0 && 
vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -396,42 +433,54 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); return Status{}; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -449,7 +498,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - if(src2 != nullptr) + if (src2 != nullptr) { add_3d_tensor_nhw_argument(idx, src2); } @@ -461,21 +510,20 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens _kernel.setArg<cl_int>(idx++, _n); _kernel.setArg<cl_int>(idx++, _k); - if(_fuse_output_stage) + if (_fuse_output_stage) { - if(vector_sum_col != nullptr) + if (vector_sum_col != nullptr) { add_3d_tensor_nhw_argument(idx, vector_sum_col); } - if(vector_sum_row != nullptr) + if (vector_sum_row != nullptr) { add_3d_tensor_nhw_argument(idx, vector_sum_row); } } enqueue(queue, *this, slice, cl::NDRange(32, 2), false); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h index 0ae549cd53..fc8b73140d 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h @@ -25,6 +25,7 @@ #define 
ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/IClKernel.h" @@ -65,29 +66,42 @@ public: * @param[in] output_multipliers (Optional) Output multipliers tensor. Supported data types: S32. * @param[in] output_shifts (Optional) Output shifts tensor. Supported data types: S32. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, + ITensorInfo *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _fuse_output_stage{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _fuse_output_stage{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp index 9ec0b5182f..d93dbde95a 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp @@ -28,11 +28,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -44,12 +43,16 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -57,26 +60,28 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -87,13 +92,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } @@ -108,29 +115,34 @@ ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() } void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset) + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t k, + int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); + auto padding_info = get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias}); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && + mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -138,8 +150,10 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - 
build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, + "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, + "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); std::string kernel_name("gemmlowp_offset_contribution"); @@ -165,10 +179,15 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) +Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); return Status{}; } @@ -177,10 +196,13 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST)); Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -209,8 +231,7 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h index 48926e280b..2080a3a091 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h @@ -67,15 +67,25 @@ public: * @param[in] b_offset Offset to be added to each element of the matrix B. 
*/ void configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset); + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t k, + int32_t a_offset, + int32_t b_offset); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp index c5fb54f524..26f479f61a 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp @@ -34,7 +34,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -46,12 +45,20 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -62,33 +69,35 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if 
(b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -99,20 +108,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches as the output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches as vector_sum_row or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches as " + "vector_sum_row or the number of batches must be set to 1"); } } } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); @@ -120,7 +131,8 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), + "per channel quantization info is incorrect"); return Status{}; } @@ -131,16 +143,26 @@ ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutpu _type = CLKernelType::ELEMENTWISE; } -void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +void ClGemmLowpOffsetContributionOutputStageKernel::configure(const 
CLCompileContext &compile_context, + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, + b_offset, output_stage, output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); + auto padding_info = + get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts}); const int min = output_stage.gemmlowp_min_bound; const int max = output_stage.gemmlowp_max_bound; @@ -148,9 +170,8 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon _is_quantized_per_channel = output_stage.is_quantized_per_channel; // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && + mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Auto initialize the output auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); @@ -160,10 +181,11 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -171,8 +193,10 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, + "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, + "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, 
"-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); @@ -210,26 +234,42 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, + b_offset, output_stage, output_multipliers, output_shifts)); return Status{}; } -void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto mm_result = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto mm_result = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = + utils::cast::polymorphic_downcast<const 
ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -260,8 +300,7 @@ void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h index cee04473c4..97ee9bc97f 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h @@ -66,23 +66,40 @@ public: * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). * Supported data types: S32 */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _is_quantized_per_channel{ false }; + bool _is_quantized_per_channel{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp index 39754385a1..7b7beab12c 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ 
b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -27,15 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,20 +46,23 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -75,7 +77,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32S _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -84,14 +88,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITenso return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // dst auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); @@ -103,19 +110,26 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi auto max = info->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % 
num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if( + (min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && + (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if( + (max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && + (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); // Create kernel - const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; + const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) + ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" + : "gemmlowp_output_stage_quantize_down_fixedpoint"; // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -129,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Create src window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -144,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -158,8 +176,7 @@ void 
ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h index 69b5fc5018..71c9f4b752 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h @@ -60,14 +60,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. * @param[in] info Output stage info. Used to pass the quantized output data type */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp index f379698326..52ebd32d46 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp @@ -27,15 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,23 +46,31 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < 
std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) - || info->gemmlowp_min_bound > info->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && + (info->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON( + info->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + info->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) || + info->gemmlowp_min_bound > info->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -78,7 +85,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleB _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -87,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); @@ -107,7 +119,8 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); @@ -130,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); }
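The three quantize-down kernels in this area differ only in how they scale the S32 accumulator back to an 8/16-bit value: by an integer multiplier and shift (ScaleKernel), by a fixed-point multiplier (ScaleByFixedPointKernel), or by a real multiplier (ScaleByFloatKernel). A minimal scalar sketch of the fixed-point path, assuming round-to-nearest and omitting the INT32_MIN saturation corner case (helper names here are illustrative, not the kernels' OpenCL source):

    #include <algorithm>
    #include <cstdint>

    // Round-to-nearest doubling high multiply: (acc * multiplier * 2) >> 31.
    int32_t rounding_doubling_high_mul(int32_t acc, int32_t multiplier)
    {
        const int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
        return static_cast<int32_t>((prod + (static_cast<int64_t>(1) << 30)) >> 31);
    }

    // Requantize one S32 GEMM accumulator to QASYMM8: scale, shift, offset, clamp.
    uint8_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        int32_t v = rounding_doubling_high_mul(acc, multiplier);
        if (shift > 0)
        {
            v = (v + (1 << (shift - 1))) >> shift; // rounding arithmetic shift right
        }
        v += offset; // corresponds to -DRESULT_OFFSET_AFTER_SHIFT
        return static_cast<uint8_t>(std::min(255, std::max(0, v))); // -DMIN_BOUND/-DMAX_BOUND clamp
    }

The -DRESULT_FIXEDPOINT_MULTIPLIER, -DRESULT_SHIFT and -DRESULT_OFFSET_AFTER_SHIFT build options seen above pass exactly these three constants to the OpenCL program; the by-float variant instead folds multiplier and shift into the single -DREAL_MULTIPLIER constant.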
-void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Create input window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -145,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -159,8 +176,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h index 8eda24d25f..057c66767f 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -62,14 +62,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info Output stage info. 
Used to pass the quantized output data type */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp index 5d54db214a..31434ce61b 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -46,25 +45,34 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && + (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) || + output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, + "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -77,7 +85,10 @@ ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel() _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); @@ -85,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); @@ -104,13 +118,18 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c auto max = output_stage->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type( + output_stage->output_data_type))) && + (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type( + 
output_stage->output_data_type))) + + (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); @@ -135,15 +154,17 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -157,8 +178,7 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h index 84c5060362..e6390801f1 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h @@ -62,14 +62,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] output_stage GEMMLowp output stage metadata. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -77,4 +84,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp index ea88b485a0..ee4a191fed 100644 --- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp @@ -32,7 +32,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,12 +46,15 @@ namespace Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(1), + "Output vector must have length equal to the number of rows of the input matrix"); } return Status{}; } @@ -60,12 +62,15 @@ Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITens Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(0), + "Output vector must have length equal to the number of columns of the input matrix"); } return Status{}; } @@ -76,7 +81,10 @@ IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_a, + ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); @@ -85,7 +93,7 @@ void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet initialized auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); - auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); + auto padding_info = get_padding_info({mtx_a, vector_sum_row}); // Set the arguments to pass at compile time CLBuildOptions build_opts; @@ -120,7 +128,9 @@ void 
ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, + const ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); @@ -133,8 +143,9 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); Window slice_in = collapsed.first_slice_window_2D(); @@ -151,11 +162,13 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window add_3D_tensor_argument(idx, src, slice_in); add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); + } while (collapsed.slide_window_slice_2D(slice_out)); } -void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_b, + ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -163,14 +176,15 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet initialized auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32); - auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); + auto padding_info = get_padding_info({mtx_b, vector_sum_col}); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); @@ -192,7 +206,9 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile 
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, + const ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -205,8 +221,9 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -222,8 +239,7 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window add_3D_tensor_argument(idx, src, slice_in); add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); + } while (collapsed.slide_window_slice_2D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h index 7119b5fee0..c81543e4c2 100644 --- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -52,7 +53,10 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0; + virtual void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const GEMMLowpReductionKernelInfo &info) = 0; }; /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. @@ -74,14 +78,18 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. 
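These reductions exist to feed the offset-contribution step seen earlier in this diff: with non-zero quantization offsets, the quantized GEMM needs the sum of every row of matrix A and of every column of matrix B (compare the -DA_OFFSET/-DB_OFFSET/-DK_OFFSET build options in the offset-contribution kernels). Functionally the two kernels compute the equivalent of the following host-side sketch (illustrative C++ with hypothetical names; the real kernels vectorise and window this work):

    #include <cstdint>
    #include <vector>

    // vector_sum_row: one S32 sum per row of an M x K matrix A (row-major).
    std::vector<int32_t> sum_rows(const std::vector<uint8_t> &a, int m, int k)
    {
        std::vector<int32_t> sums(m, 0);
        for (int row = 0; row < m; ++row)
        {
            for (int col = 0; col < k; ++col)
            {
                sums[row] += a[row * k + col];
            }
        }
        return sums;
    }

    // vector_sum_col: one S32 sum per column of a K x N matrix B (row-major).
    std::vector<int32_t> sum_cols(const std::vector<uint8_t> &b, int k, int n)
    {
        std::vector<int32_t> sums(n, 0);
        for (int row = 0; row < k; ++row)
        {
            for (int col = 0; col < n; ++col)
            {
                sums[col] += b[row * n + col];
            }
        }
        return sums;
    }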
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_a, + ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixAReductionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); + static Status + validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -106,14 +114,18 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_b, + ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixBReductionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); + static Status + validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp index b8997dfc7f..fd23aa9924 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" #include "src/core/helpers/AutoConfiguration.h" @@ -51,7 +52,13 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -59,15 +66,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native"); @@ -82,7 +94,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -91,15 +103,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); } - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -107,9 +120,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = 
dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -117,9 +131,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; @@ -132,17 +151,18 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + if (reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -155,34 +175,34 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic src0_access(src0, 0, 0, - src0->dimension(0), - src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); + AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1)); + AccessWindowStatic src1_access( + src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1)); - if(src2 != nullptr) + if (src2 != nullptr) { 
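// [Editor's annotation, not part of the upstream diff] The bias access
// window built just below rounds the bias width up to the X processing step,
// so the vectorised stores of the last iteration stay inside allocated
// padding: with 10 bias elements and n0 == 4, ceil_to_multiple(10, 4) == 12.
// update_window_and_padding() then grows each tensor's padding to cover the
// requested access windows and reports whether anything had to change; that
// flag is what produces the "Insufficient Padding!" status further down.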
const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + AccessWindowStatic src2_access(src2, 0, 0, ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), src2->dimension(1)); - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + window_changed = update_window_and_padding(win, src0_access, src1_access, + src2_access) || // window used by the execute_window_loop + update_window_and_padding( + win_out, dst_access); // window used to update the padding requirements of dst tensor } else { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + window_changed = + update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, + dst_access); // window used to update the padding requirements of dst tensor } // Collapse along the Z direction @@ -191,7 +211,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -201,19 +222,26 @@ ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel() _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, +void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, dst }); + auto padding_info = get_padding_info({src0, dst}); _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); @@ -221,7 +249,7 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile // In case both input and dst have to be 
reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -234,7 +262,8 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, + rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -260,14 +289,17 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); @@ -275,9 +307,13 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + 
lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_native"); @@ -314,21 +350,23 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), src2 != nullptr ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -338,15 +376,18 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -358,11 +399,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be 
reinterpreted as 3D tensor unsigned int idx0; - if(_add_bias) + if (_add_bias) { idx0 = 4 * num_arguments_per_2D_tensor() + 7; } @@ -374,11 +415,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor unsigned int idx0; - if(_add_bias) + if (_add_bias) { idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0); } @@ -395,7 +436,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -403,7 +444,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_2D_tensor_argument(idx, src0, slice); add_2D_tensor_argument(idx, src1, slice_b); - if(_add_bias) + if (_add_bias) { add_2D_tensor_argument(idx, src2, slice); } @@ -411,7 +452,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); - if(_add_bias) + if (_add_bias) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); } @@ -423,8 +464,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_int>(idx++, _k); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h index 80f8355932..da6c9a5bb7 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -58,7 +59,13 @@ public: * rhs_info.k0: same of lhs_info.k0 * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -68,7 +75,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const 
ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -76,14 +89,14 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp index d72d29ea1e..4fe6bddb36 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -52,7 +53,13 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -61,42 +68,50 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && 
((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), + "Only 2,3,4,8,16 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), + "Mixed precision only supported for F16 data type"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; - TensorShape tensor_shape0{ src0->tensor_shape() }; + TensorShape tensor_shape0{src0->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -107,15 +122,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped0 = + src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); 
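[Editor's annotation, not part of the upstream diff] The shape handling above follows the library convention that dimension 0 is the width (columns) and dimension 1 the height (rows): for an M-by-N = (M-by-K) x (K-by-N) GEMM the LHS carries shape (k, m) and the RHS (n, k), which is why tensor_shape0/tensor_shape1 set dimension 0 to k/n and dimension 1 to m/k before the reshape calculators run. A compact sketch of that convention, restricted to two dimensions (gemm_dst_shape is a hypothetical, simplified stand-in for compute_mm_shape):

    #include <array>
    #include <cassert>

    // ACL-style 2D shape: index 0 = width (columns), index 1 = height (rows).
    using Shape = std::array<unsigned int, 2>;

    // Destination shape of an (m x k) * (k x n) GEMM, in (width, height) order.
    Shape gemm_dst_shape(const Shape &lhs, const Shape &rhs)
    {
        assert(lhs[0] == rhs[1]); // the inner dimension k must agree
        return {rhs[0], lhs[1]}; // (n, m)
    }

    int main()
    {
        const Shape lhs{128, 64}; // k = 128, m = 64
        const Shape rhs{32, 128}; // n = 32, k = 128
        const Shape dst = gemm_dst_shape(lhs, rhs);
        assert(dst[0] == 32 && dst[1] == 64);
        return 0;
    }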
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -123,9 +141,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; @@ -134,7 +157,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -147,7 +170,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -164,18 +188,26 @@ ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel() _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = 
get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; @@ -188,14 +220,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - (src2 != nullptr) ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed); + auto win_config = validate_and_configure_window( + src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), + lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -213,12 +240,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi // Create build options CLBuildOptions build_opts; - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); @@ -229,7 +259,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); + build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision + ? 
get_cl_type_from_data_type(DataType::F32) + : get_cl_type_from_data_type(data_type))); build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); @@ -237,9 +269,13 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_"); kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; @@ -287,9 +323,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); return Status{}; @@ -300,15 +342,18 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor 
*>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -324,12 +369,14 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do @@ -337,7 +384,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -348,7 +395,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind add_2D_tensor_argument(idx, src0, slice); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } @@ -370,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); // Bias stride_z (if _add_bias == true) - if(_add_bias) + if (_add_bias) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); } @@ -379,7 +426,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); // Cross-plane padding (if _reinterpret_output_as_3d == true) - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -393,8 +440,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h index 8d25412a40..30928c4e1d 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H #define
ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H +#include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" -#include "arm_compute/core/KernelDescriptors.h" - namespace arm_compute { namespace opencl @@ -83,16 +83,29 @@ public: * * @note lhs_info.k0 must be equal to rhs_info.k0 */ - void configure(const ClCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -100,14 +113,14 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp index b34c17cda8..1b19f1ec5b 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,24 +47,36 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo 
&gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); @@ -71,19 +84,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -93,10 +107,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = 
src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -106,9 +121,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -116,8 +132,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +Window validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; @@ -128,14 +150,14 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
// This approach should only be used when the input/dst tensors have pad on the y direction - if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) + if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) { reinterpret_output_as_3d = false; } TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -148,7 +170,8 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -164,14 +187,22 @@ ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKe _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); @@ -182,11 +213,11 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext _export_to_cl_image = rhs_info.export_to_cl_image; _has_pad_y = gemm_info.has_pad_y; - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) + if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -199,8 +230,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext ElementsProcessed num_elements_processed{}; // Configure kernel window - Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? 
src2->clone().get() : nullptr, dst->clone().get(), lhs_info, rhs_info, gemm_info, - num_elements_processed); + Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), + (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), + lhs_info, rhs_info, gemm_info, num_elements_processed); ICLKernel::configure_internal(win); // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true, @@ -225,7 +257,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); @@ -240,17 +273,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - if(_has_pad_y) + if (_has_pad_y) { build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); } - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? 
"t" : "nt"; @@ -294,28 +333,39 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); return Status{}; } -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -341,12 +391,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do @@ -354,7 +406,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and 
matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -365,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con add_2D_tensor_argument(idx, src0, slice); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } @@ -387,22 +439,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size])); // Bias stride_z (if _add_bias == true) - if(_add_bias) + if (_add_bias) { - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size])); + _kernel.setArg<cl_uint>(idx++, + static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size])); } // dst stride_z _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size])); // Cross-plane padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d && _has_pad_y) + if (_reinterpret_input_as_3d && _has_pad_y) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs)); } // Cross-plane padding (if reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d && _has_pad_y) + if (_reinterpret_output_as_3d && _has_pad_y) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out)); } @@ -413,8 +466,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg<cl_int>(idx++, _k); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h index 471160c94b..e8fd78d476 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H +#include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" -#include "arm_compute/core/KernelDescriptors.h" - namespace arm_compute { namespace opencl @@ -74,32 +74,46 @@ public: * rhs_info.transpose: true,false * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * *
Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - bool _has_pad_y{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + bool _has_pad_y{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 734f8f9b4c..9a2a4890f3 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -23,16 +23,17 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -56,23 +57,36 @@ constexpr int mmul_m0 = 4; constexpr int mmul_n0 = 4; constexpr int mmul_k0 = 4; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target 
platform"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1, "Only values greater than 0 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && rhs_info.n0 != 8 && rhs_info.n0 != 16, "Only 1,2,3,4,8, and 16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && + rhs_info.n0 != 8 && rhs_info.n0 != 16, + "Only 1,2,3,4,8, and 16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 != 1 || lhs_info.k0 != 1), "Only 1 is supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.h0 != 4), "Only 4 is supported for h0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, "Only true is supported for interleave with mmul extension enabled"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, "Only false is supported for transpose with mmul extension enabled"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, + "Only true is supported for interleave with mmul extension enabled"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, + "Only false is supported for transpose with mmul extension enabled"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); @@ -87,7 +101,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); // Validate the reinterpreted-as-3D-case - if(gemm_info.depth_output_gemm3d != 0) + if (gemm_info.depth_output_gemm3d != 0) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -97,9 +111,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } // Validate the gemm-batched case - if(src1->num_dimensions() > 2) + if (src1->num_dimensions() > 2) { - if(gemm_info.depth_output_gemm3d != 0) + if (gemm_info.depth_output_gemm3d != 0) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(3) != src1->dimension(2)); } @@ -109,15 +123,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -125,18 
+140,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -144,7 +161,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -152,11 +173,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -204,19 +226,26 @@ ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmMatrixMultiplyReshapedOnlyR _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, +void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, 
src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; @@ -236,7 +265,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); @@ -249,7 +279,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); @@ -283,37 +314,44 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), src2 != nullptr ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info) - .first); + dst->clone().get(), lhs_info, rhs_info, gemm_info) + .first); return Status{}; } -void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -321,12 +359,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } Window slice = window.first_slice_window_3D(); @@ -336,14 +376,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, unsigned int idx = 0; add_3d_tensor_nhw_argument(idx, src0); - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - if(_add_bias) + if (_add_bias) { add_3d_tensor_nhw_argument(idx, src2); } @@ -358,8 +398,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, // LWS_x should be multiple of 16 at least. 
(32, 2) has been chosen to have more work-items on a single core // LWS also enforces the order of execution of the workitems which improves cache utilization enqueue(queue, *this, slice, cl::NDRange(32, 2), false); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h index 59612fcf5d..86d3012f6e 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,13 @@ public: * rhs_info.transpose: false * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -69,7 +76,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -77,11 +90,11 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp index bf4b664b6e..eea2a169a3 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,13 +47,17 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 
0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose); @@ -60,10 +65,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -71,14 +77,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Window +configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) { const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0; const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0; TensorInfo tmp_info(*src); - if(reinterpret_input_as_3d) + if (reinterpret_input_as_3d) { // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -88,10 +95,12 @@ Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixI } // dst auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d))); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape( + *src, lhs_info, reinterpret_input_as_3d))); // Configure window - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -106,14 +115,18 @@ ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); - auto padding_info =
get_padding_info({ src }); + auto padding_info = get_padding_info({src}); const unsigned int src_w = src->dimension(0); const unsigned int m = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1); @@ -168,7 +181,10 @@ void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_con ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); return Status{}; @@ -179,8 +195,9 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -192,8 +209,7 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, src); add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h index db88e0d735..8e84e8ad8e 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h @@ -57,14 +57,21 @@ public: * lhs_info.interleave: true, false * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_src_as_3d = false); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_src_as_3d); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -72,4 +79,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
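The k0 check in the LHS reshape validator above leans on a bit trick: x & (x - 1) clears the lowest set bit, so the expression is zero exactly when x is a power of two, with 3 special-cased on top. A minimal standalone sketch of the documented {2,3,4,8,16} contract from the error message (is_supported_k0 is an illustrative name, not ACL API):

#include <cassert>
#include <initializer_list>

// Power-of-two test via x & (x - 1), with 3 special-cased, mirroring
// ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && k0 != 3), ...)
// combined with the documented bounds, so the accepted set is {2, 3, 4, 8, 16}.
static bool is_supported_k0(unsigned int k0)
{
    const bool pow2_or_3 = ((k0 & (k0 - 1)) == 0) || (k0 == 3);
    return pow2_or_3 && k0 >= 2 && k0 <= 16;
}

int main()
{
    for (unsigned int k0 : {2u, 3u, 4u, 8u, 16u})
    {
        assert(is_supported_k0(k0));
    }
    for (unsigned int k0 : {5u, 6u, 7u, 9u, 32u})
    {
        assert(!is_supported_k0(k0));
    }
    return 0;
}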
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp index b3a03880ed..b9ce3873c7 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -52,8 +53,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), + "Only 1,2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); @@ -61,15 +64,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type()); + const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, + src->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -77,23 +82,27 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) { const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0; const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0; bool window_changed = false; // dst auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); + auto_init_if_empty( + *dst, 
src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); // Configure window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); window_changed = update_window_and_padding(win, src_access); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { gemm::update_padding_for_cl_image(dst); } @@ -102,7 +111,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win.collapse(win, Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -112,7 +122,10 @@ ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -143,7 +156,9 @@ void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_con _kernel.setArg<cl_int>(idx++, rhs_info.h0); } -Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first); @@ -156,8 +171,9 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -169,9 +185,8 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, src); add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute
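When rhs_info.export_to_cl_image is set, the kernels above rebind the reshaped RHS buffer as a read-only 2D OpenCL image: one RGBA texel packs four tensor elements, which is why the shape2d width is dimension(0) / 4 while dimension(1) and the batch dimension(2) are folded into rows, and the row pitch is taken straight from the tensor's Y stride. A sketch of that mapping under those assumptions (Image2dDims and rhs_image_dims are illustrative helpers, not ACL API):

#include <array>
#include <cstddef>

// Derive 2D image dimensions for a buffer exported as a CL image,
// mirroring the shape2d/image_row_pitch computation in run_op above.
struct Image2dDims
{
    std::size_t width;     // texels per row: elements / 4 (one RGBA texel = 4 elements)
    std::size_t height;    // rows: dimension(1) folded with the batch dimension(2)
    std::size_t row_pitch; // bytes per row, taken from the tensor's Y stride
};

Image2dDims rhs_image_dims(const std::array<std::size_t, 3> &dims, std::size_t stride_y_bytes)
{
    return Image2dDims{dims[0] / 4, dims[1] * dims[2], stride_y_bytes};
}

For example, a reshaped F32 RHS of 64x32x2 with a 256-byte Y stride maps to a 16x64 image with a 256-byte row pitch.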
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h index 31eaa46e02..7203d574fb 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h @@ -66,7 +66,10 @@ public: * rhs_info.transpose: true, false * rhs_info.interleave: true, false */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure() @@ -81,4 +84,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
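Every run_op in these kernels dispatches with the same shape: take the first 3D slice of the execution window, bind the tensor arguments for that slice, enqueue, then slide to the next Z slice until the window is exhausted. Stripped of the ACL types, the control flow reduces to the sketch below (Slice3D and enqueue_slice stand in for Window and the add_*_tensor_argument/enqueue calls):

#include <cstdio>

// One Z plane of the collapsed execution window.
struct Slice3D
{
    int z;
};

// Stand-in for setting per-slice kernel arguments and enqueueing the NDRange.
static void enqueue_slice(const Slice3D &slice)
{
    std::printf("enqueue slice z=%d\n", slice.z);
}

int main()
{
    const int depth = 4; // extent of the collapsed Z dimension
    Slice3D slice{0};
    do
    {
        enqueue_slice(slice);        // mirrors enqueue(queue, *this, slice, ...)
    } while (++slice.z < depth);     // slide_window_slice_3D returns false past the end
    return 0;
}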
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp index 719201d1fe..2e1cefc6e7 100644 --- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -62,8 +62,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co } } // namespace -ClHeightConcatenateKernel::ClHeightConcatenateKernel() - : _height_offset(0) +ClHeightConcatenateKernel::ClHeightConcatenateKernel() : _height_offset(0) { _type = CLKernelType::ELEMENTWISE; } @@ -74,12 +73,15 @@ Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int return Status{}; } -void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) +void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int height_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _height_offset = height_offset; @@ -90,9 +92,10 @@ void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -125,8 +128,9 @@ void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = 
utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h index d3c077fc22..5a391a1212 100644 --- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h @@ -50,7 +50,8 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClHeightConcatenateKernel::configure() @@ -64,7 +65,7 @@ public: private: unsigned int _height_offset; - int32_t _depth{ 0 }; + int32_t _depth{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp index e890847199..ef7a52828f 100644 --- a/src/gpu/cl/kernels/ClIm2ColKernel.cpp +++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -60,13 +61,19 @@ struct Im2ColConfiguration bool is_padding_required_nchw{}; }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); @@ -82,9 +89,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { - const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, 
kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape( + compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -93,13 +101,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_elems_processed_per_iteration, + bool is_padding_required_nchw, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape)); @@ -113,22 +129,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso bool window_changed = false; Window win; - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); } else { - if(is_padding_required_nchw) + if (is_padding_required_nchw) { - const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - win = calculate_max_window(*src, - Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); - AccessWindowStatic input_access(src, - -border.left, - -border.top, - ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), - input_height + border.bottom); + const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), + conv_info.pad_left()); + win = calculate_max_window( + *src, Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); + AccessWindowStatic input_access( + src, -border.left, -border.top, + ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), + input_height + border.bottom); window_changed = window_changed || update_window_and_padding(win, input_access); } else @@ -142,11 +158,17 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) +Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -157,7 +179,8 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D const unsigned int input_height = src->dimension(height_idx); const unsigned int input_channel = src->dimension(channel_idx); - const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + const std::pair<unsigned int, unsigned int> convolved_dims = + scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); // Im2Col configuration std::string kernel_name = "im2col_generic_"; @@ -184,21 +207,22 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); + build_opts.add_option_if_else(is_data_type_quantized(data_type), + "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); build_opts.add_option_if(has_bias, "-DHAS_BIAS"); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { num_elems_processed_per_iteration = std::min(2U, input_channel); is_padding_required_nchw = false; // Only the 3x3 and 9x9 cases are optimized for NHWC - if(kernel_dims == Size2D(3U, 3U)) + if (kernel_dims == Size2D(3U, 3U)) { kernel_name = "im2col3x3_"; build_opts.add_option("-DIM2COL_3X3"); } - else if(kernel_dims == Size2D(9U, 9U)) + else if (kernel_dims == Size2D(9U, 9U)) { kernel_name = "im2col9x9_"; build_opts.add_option("-DIM2COL_9X9"); @@ -219,17 +243,17 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D } else { - if(dilation == Size2D(1U, 1U)) + if (dilation == Size2D(1U, 1U)) { const bool squared_im2col = kernel_dims.width == kernel_dims.height; - if(squared_im2col) + if (squared_im2col) { // Check if we can run an optimized im2col for NCHW - switch(kernel_dims.width) + switch (kernel_dims.width) { case 1: // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false - if(conv_info.stride().first == 1 && !conv_info.has_padding()) + if (conv_info.stride().first == 1 && !conv_info.has_padding()) { kernel_name = "im2col1x1_stridex1_"; num_elems_processed_per_iteration = 4; @@ -248,7 +272,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D break; case 11: // Optimized im2col11x11 if pad_x = pad_y = 0 - if(!conv_info.has_padding()) + if (!conv_info.has_padding()) { kernel_name = "im2col11x11_padx0_pady0_"; num_elems_processed_per_iteration = 1; @@ -262,7 +286,7 @@ Im2ColConfiguration 
configure_opencl_kernel(const ITensorInfo *src, const Size2D break; } } - else if(kernel_dims.width > 1 && !conv_info.has_padding()) + else if (kernel_dims.width > 1 && !conv_info.has_padding()) { kernel_name = "im2col_generic_padx0_pady0_"; num_elems_processed_per_iteration = 1; @@ -297,19 +321,29 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D } // namespace ClIm2ColKernel::ClIm2ColKernel() - : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() + : _data_layout(DataLayout::UNKNOWN), + _convolved_dims(), + _num_elems_processed_per_iteration(1), + _kernel_dims(), + _conv_info(), + _num_groups() { _type = CLKernelType::ELEMENTWISE; } -void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation, - unsigned int num_groups) +void ClIm2ColKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _data_layout = src->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); @@ -320,19 +354,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI // Select and configure the optimal OpenCL kernel to run. 
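// The _convolved_dims pair computed below via scaled_dimensions() follows the
// standard convolution output arithmetic; assuming floor rounding it is
//   out_w = (in_w + pad_left + pad_right - dilation_x * (k_w - 1) - 1) / stride_x + 1
// (and likewise for the height). For example, a 224x224 input with a 3x3
// kernel, stride 2, pad 1 and no dilation gives (224 + 2 - 2 - 1) / 2 + 1 = 112,
// i.e. a 112x112 im2col output grid.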
// This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration // and the padding requirement flag - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + Im2ColConfiguration im2col_config = + configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); // Create kernel _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options); - _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + _convolved_dims = + scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; _kernel_dims = kernel_dims; // Only needed by the Tuner _conv_info = conv_info; // Only needed by the Tuner _num_groups = num_groups; // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, + im2col_config.num_elems_processed_per_iteration, im2col_config.is_padding_required_nchw, num_groups); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -353,14 +390,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); } -Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +Status ClIm2ColKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + Im2ColConfiguration im2col_config = + configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, + conv_info, has_bias, dilation, + im2col_config.num_elems_processed_per_iteration, im2col_config.is_padding_required_nchw, num_groups) - .first); + .first); return Status{}; } @@ -388,7 +433,7 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm Window slice_in = first_slice_3d; Window slice_out = window_output.first_slice_window_2D(); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); const int num_batches = tmp_win[3].end(); @@ -398,7 +443,10 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm } else { - slice.set(0, Window::Dimension(0, 
static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration)); + slice.set(0, + Window::Dimension( + 0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), + _num_elems_processed_per_iteration)); slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1)); // Note: In case of NCHW the 3rd dimension is already set collapsing the input window } @@ -414,14 +462,16 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + unsigned int idx = num_arguments_per_3D_tensor() + + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3])); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); + _kernel.setArg<cl_uint>(idx++, + static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); do { unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice_in); - if(_num_groups == 1) + if (_num_groups == 1) { add_2D_tensor_argument(idx, dst, slice_out); } @@ -430,8 +480,8 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm add_3D_tensor_argument(idx, dst, slice_out); } enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); + } while (window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && + window_collapsed.slide_window_slice_3D(slice_in)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h index a637ad215d..c8cd5b328d 100644 --- a/src/gpu/cl/kernels/ClIm2ColKernel.h +++ b/src/gpu/cl/kernels/ClIm2ColKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -77,28 +78,38 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIm2ColKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: - DataLayout _data_layout; + DataLayout _data_layout; std::pair<unsigned int, unsigned int> _convolved_dims; - unsigned int _num_elems_processed_per_iteration; - Size2D _kernel_dims; - PadStrideInfo _conv_info; - unsigned int _num_groups; + unsigned int _num_elems_processed_per_iteration; + Size2D _kernel_dims; + PadStrideInfo _conv_info; + unsigned int _num_groups; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp index d291fad76c..8c493d08c6 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,26 +44,29 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or 
equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(), + weights->tensor_shape(), conv_info, desc)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -75,8 +79,12 @@ ClIndirectConv2dAddressPrecalculationKernel::ClIndirectConv2dAddressPrecalculati _type = CLKernelType::ELEMENTWISE; } -void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info, desc)); @@ -85,11 +93,8 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte constexpr unsigned int height_idx = 2; // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorShape output_conv_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); @@ -136,14 +141,19 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte // Since this kernel should be called only once, we do not need to set the config_id for tuning } -Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info, desc)); return Status{}; } -void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h index ff7f4be147..b565609c6a 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h +++ 
b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h @@ -60,16 +60,23 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC direct/indirect convolution kernel. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIndirectConv2dAddressPrecalculationKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp index a337eb50fd..3510b6970c 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp @@ -23,13 +23,14 @@ */ #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,8 +47,14 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *indirect_buffer, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); @@ -55,37 +62,38 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indirect_buffer, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(indirect_buffer->tensor_shape(), - misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + 
indirect_buffer->tensor_shape(), + misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(), + weights->tensor_shape(), conv_info, desc)); constexpr int channel_idx = 0; constexpr int batch_idx = 3; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); - if(desc.export_weights_to_cl_image) + if (desc.export_weights_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, - "K0 can only be: 4, 8, and 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights), "Export to CLImage is not supported for this weight configuration"); } - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -95,15 +103,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -116,13 +123,21 @@ ClIndirectConv2dKernel::ClIndirectConv2dKernel() _type = CLKernelType::DIRECT; } -void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *indirect_buffer, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, 
const DirectConvComputeKernelInfo &desc) +void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *indirect_buffer, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, indirect_buffer, dst); // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); constexpr unsigned int channel_idx = 0; constexpr unsigned int width_idx = 1; @@ -137,10 +152,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); // Configure kernel window Window win; @@ -164,7 +176,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, _export_to_cl_image = desc.export_weights_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_to_cl_image) + if (_export_to_cl_image) { gemm::update_padding_for_cl_image(weights); } @@ -173,11 +185,12 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // When M0 is 5, 6, or 7, we use vload8 to fetch the data from the buffer const unsigned int load_indirect_buf_size = m0 > 4 ? 
8 : m0; const unsigned int indirect_buf_width = indirect_buffer->tensor_shape()[0]; - const unsigned int round_up_width = ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size; - const unsigned int padding = round_up_width - indirect_buf_width; + const unsigned int round_up_width = + ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size; + const unsigned int padding = round_up_width - indirect_buf_width; indirect_buffer->extend_padding(PaddingSize(0, indirect_buffer->padding().right + padding, 0, 0)); - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -186,9 +199,10 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324 const auto act_function = act_info.activation(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (data_type == DataType::F32 || data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (data_type == DataType::F32 || data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -224,7 +238,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // A macro guard to compile ONLY the kernel of interest build_options.add_option("-D" + upper_string(kernel_name.str())); - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -253,10 +267,17 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, _config_id += support::cpp11::to_string(dst->dimension(channel_idx)); } -Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *indirect_buffer, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); return Status{}; } @@ -268,35 +289,42 @@ void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor 
*>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - const auto indirect_buffer = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + const auto indirect_buffer = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); cl::Image2D weights_cl_image; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const size_t image_w = weights->info()->dimension(0) / 4; - const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); + const size_t image_w = weights->info()->dimension(0) / 4; + const size_t image_h = + weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + weights_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, + weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, indirect_buffer); add_4d_tensor_nhwc_argument(idx, dst); - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h index b6c7b35fa4..04166d417e 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -60,22 +61,35 @@ public: * @param[in] act_info Contains activation information described in @ref ActivationLayerInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC indirect convolution kernel. 
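 *
 * @note Editorial sketch, not part of this patch: the validation in ClIndirectConv2dKernel.cpp
 *       accepts m0 in [1, 8], n0 and k0 in {1, 2, 3, 4, 8, 16}, and narrows k0 to {4, 8, 16}
 *       whenever the weights are exported to a cl_image. A hypothetical descriptor that passes
 *       those checks could look like:
 * @code
 * DirectConvComputeKernelInfo desc{};
 * desc.m0                         = 4;    // rows computed per work-item; must be in [1, 8]
 * desc.n0                         = 4;    // output channels per work-item; 1, 2, 3, 4, 8 or 16
 * desc.k0                         = 4;    // accumulation depth; only 4, 8 or 16 with cl_image weights
 * desc.export_weights_to_cl_image = true; // route weight fetches through the texture pipeline
 * @endcode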
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *off, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *off, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIndirectConv2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *off, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *off, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: - bool _export_to_cl_image{ false }; + bool _export_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp index 66331bc818..0bb6b0c083 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp @@ -29,17 +29,16 @@ #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -62,51 +61,62 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) + if (!adj_lhs || adj_rhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & 
(k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); } return Status{}; } -} +} // namespace ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && act_info.activation() != ActivationFunction::RELU - && act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && act_info.activation() != ActivationFunction::BOUNDED_RELU), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && + act_info.activation() != ActivationFunction::RELU && + act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && + act_info.activation() != ActivationFunction::BOUNDED_RELU), "Activation Function specified is unsupported."); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -115,7 +125,12 @@ Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorI return Status{}; } -void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); @@ -123,7 +138,8 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet 
initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -217,10 +233,13 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -229,7 +248,7 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h index 64415f42f7..ffdb720855 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -54,7 +55,12 @@ public: * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel * @param[in] act_info (Optional) Class containing information about fused activation function. 
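 *
 * @note Editorial sketch, not part of this patch: the M0/N0/K0 checks in
 *       validate_matmul_kernel_info() above encode the supported block sizes {1, 2, 3, 4, 8, 16}
 *       with a power-of-two test; an equivalent standalone predicate (hypothetical helper, shown
 *       only to unpack the bit trick) is:
 * @code
 * // v & (v - 1) is zero exactly when v is a power of two, so the accepted
 * // values are the powers of two up to 16 plus the special case 3.
 * inline bool is_supported_block_size(int v)
 * {
 *     return (v >= 1) && (v <= 16) && (((v & (v - 1)) == 0) || (v == 3));
 * }
 * @endcode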
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -62,7 +68,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp index 464212d7db..94e3c4e47b 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp @@ -28,10 +28,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -64,13 +64,15 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 - ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 4), "Only 4 is supported for k0"); @@ -84,7 +86,11 @@ ClMatMulLowpNativeMMULKernel::ClMatMulLowpNativeMMULKernel() _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); @@ -102,16 +108,17 @@ Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITen ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY), "Activation Function 
specified is unsupported."); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -121,15 +128,21 @@ Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITen return Status{}; } -void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, - const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) +void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info, act_info); ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); ARM_COMPUTE_UNUSED(compile_context, lhs, rhs, bias, matmul_kernel_info, act_info); CLBuildOptions build_opts; @@ -147,7 +160,8 @@ void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_con const unsigned int n0_leftover = n % n0; // Configure kernel window - const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); + const auto win_config = + validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -215,10 +229,13 @@ void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto *lhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto *rhs = + utils::cast::polymorphic_downcast<const 
ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -227,7 +244,7 @@ void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h index d2aa40b2e2..6c56f15d74 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -49,7 +50,12 @@ public: * * @return a status */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -57,7 +63,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp index 41ba5d5e25..a1fa9fa9ab 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/CL/CLUtils.h" @@ -38,7 +38,6 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -61,20 +60,23 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) + if (!adj_lhs || adj_rhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); } return Status{}; @@ -83,30 +85,37 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings()); - if(matmul_kernel_info.export_rhs_to_cl_image) + if (matmul_kernel_info.export_rhs_to_cl_image) { - if(matmul_kernel_info.adj_rhs) + if (matmul_kernel_info.adj_rhs) { const int k0 = matmul_kernel_info.k0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, "K0 can only be: 4, 8, and 16 for Rhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, + "K0 can only be: 4, 8, and 16 for Rhs transposed"); } else { const int n0 = matmul_kernel_info.n0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, + "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), + "Export to CLImage is not supported for this device/configuration"); } return Status{}; } -} +} // namespace ClMatMulNativeKernel::ClMatMulNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -114,28 +123,36 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info)); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), 
rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, lhs); ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], + "First dimension of bias and output tensors must match."); } return Status{}; } -void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); @@ -143,7 +160,8 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -187,7 +205,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); - if(_export_rhs_to_cl_image) + if (_export_rhs_to_cl_image) { gemm::update_padding_for_cl_image(rhs); } @@ -222,10 +240,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = 
utils::cast::polymorphic_downcast<const ICLTensor *>( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -235,7 +256,7 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl add_3d_tensor_nhw_argument(idx, lhs); cl::Image2D rhs_cl_image; - if(_export_rhs_to_cl_image) + if (_export_rhs_to_cl_image) { const size_t image_w = rhs->info()->dimension(0) / 4; const size_t image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0); @@ -243,12 +264,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, + rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); _kernel.setArg(idx++, rhs_cl_image); } add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h index fe2b787c12..2cb150bc8f 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -52,7 +53,12 @@ public: * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel * @param[in] act_info (Optional) Specifies activation function to use after Matrix multiplication. Default is Identity function. 
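 *
 * @note Editorial sketch, not part of this patch: when matmul_kernel_info.export_rhs_to_cl_image
 *       is set, run_op() re-reads the RHS buffer as a read-only 2D image with four elements per
 *       texel, mirroring the shape computation in the .cpp hunk above:
 * @code
 * const size_t image_w         = rhs->info()->dimension(0) / 4; // 4 elements per texel
 * const size_t image_h         = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0);
 * const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1];
 * // create_image2d_from_buffer() then wraps the existing cl_buffer rather than copying it.
 * @endcode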
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -60,14 +66,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _export_rhs_to_cl_image{ false }; + bool _export_rhs_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp index 432270e8bf..76bf846e74 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp @@ -28,14 +28,13 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -62,31 +61,38 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 1), "Only 1 is supported for k0"); return Status{}; } -} +} // namespace ClMatMulNativeMMULKernel::ClMatMulNativeMMULKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo 
*bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); @@ -96,32 +102,40 @@ Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorI const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], + "First dimension of bias and output tensors must match."); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, bias); } return Status{}; } -void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info); ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -135,7 +149,8 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context const int n0 = adjust_vec_size(matmul_kernel_info.n0, n); // Configure kernel window - const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); + const auto win_config = + 
validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -186,17 +201,20 @@ void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); unsigned int idx = 0; add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h index 80448974c4..1aeb896325 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h @@ -72,22 +72,31 @@ public: * @param[out] dst Output tensor info. 
* @param[in] matmul_info Attributes for Batch MatMul Kernel */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulNativeMMULKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - int _m{ 1 }; - int _n{ 1 }; - int _k{ 1 }; + int _m{1}; + int _n{1}; + int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp index 5ca0639852..3b59c2a7fc 100644 --- a/src/gpu/cl/kernels/ClMulKernel.cpp +++ b/src/gpu/cl/kernels/ClMulKernel.cpp @@ -23,15 +23,16 @@ */ #include "src/gpu/cl/kernels/ClMulKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,24 +47,25 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); 
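// The checks that follow encode the kernel's runtime contract: the scale factor must be non-negative, an activation can only be fused when the destination type is floating point, and the two input shapes must be broadcast compatible.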
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); @@ -76,27 +78,35 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && + (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), "Dst can only be U8 if both src are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), - "Dst can only be QASYMM8 if both src are QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), - "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), - "Dst can only be QSYMM16 if both src are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QASYMM8 && + (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), + "Dst can only be QASYMM8 if both src are QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QASYMM8_SIGNED && + (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), + "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QSYMM16 && + (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), + "Dst can only be QSYMM16 if both src are QSYMM16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && + (dst->data_type() != DataType::S32), "Dst must be S32 if source tensors are S32"); - if(in_place) + if (in_place) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, + src1_in_place ? 
src1->tensor_shape() : src2->tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); } else { @@ -114,14 +124,19 @@ ClMulKernel::ClMulKernel() _type = CLKernelType::ELEMENTWISE; } -void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void ClMulKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, - scale, overflow_policy, rounding_policy, act_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); @@ -133,7 +148,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -14 <= e <= 1 // Moreover, it will be negative as we deal with 1/2^n - if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) + if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) { // Store the positive exponent. We know that we compute 1/2^n // Additionally we need to subtract 1 to compensate for the fact that frexp used a mantissa of 0.5 @@ -142,19 +157,19 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo std::string acc_type; // Check if it has float src and dst - if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) + if (is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) { scale_int = -1; acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half"; } else { - if(src1->element_size() == 4 || src2->element_size() == 4) + if (src1->element_size() == 4 || src2->element_size() == 4) { // Use 64-bit accumulator for 32-bit input acc_type = "long"; } - else if(src1->element_size() == 2 || src2->element_size() == 2) + else if (src1->element_size() == 2 || src2->element_size() == 2) { // Use 32-bit accumulator for 16-bit input acc_type = "int"; @@ -176,11 +191,15 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type())); build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) + ?
"1" + : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) + ? "1" + : support::cpp11::to_string(vec_size))); build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - if(is_quantized && (dst->data_type() != DataType::S32)) + if (is_quantized && (dst->data_type() != DataType::S32)) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -200,12 +219,14 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo else { kernel_name += (scale_int >= 0) ? "_int" : "_float"; - build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE"); + build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), + "-DWRAP", "-DSATURATE"); build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte"); build_opts.add_option("-DACC_DATA_TYPE=" + acc_type); - if(act_info.enabled()) + if (act_info.enabled()) { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); } @@ -223,7 +244,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo // Set scale argument unsigned int idx = (in_place ? 
2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - if(scale_int >= 0 && !is_quantized) + if (scale_int >= 0 && !is_quantized) { _kernel.setArg(idx++, scale_int); } @@ -261,8 +282,13 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo _config_id += support::cpp11::to_string(dst->dimension(2)); } -Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status ClMulKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); @@ -275,9 +301,11 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); @@ -286,17 +314,18 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command const TensorShape &out_shape = dst->info()->tensor_shape(); bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -312,7 +341,7 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_input1); add_3D_tensor_argument(idx, src_1, slice_input2); - if(!in_place) + if (!in_place) { add_3D_tensor_argument(idx, dst, slice); } @@ -320,15 +349,17 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } namespace { constexpr unsigned int vec_size_complex = 1; -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status validate_arguments_complex(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32); @@ -340,11 +371,12 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); } return Status{}; @@ -356,19 +388,23 @@ ClComplexMulKernel::ClComplexMulKernel() _type = CLKernelType::ELEMENTWISE; } -void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClComplexMulKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - if(act_info.enabled()) + if (act_info.enabled()) { build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); @@ -384,7 +420,10 @@ void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITen ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClComplexMulKernel::validate(const 
ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); @@ -397,26 +436,29 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl:: ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); const TensorShape &in_shape1 = src_0->info()->tensor_shape(); const TensorShape &in_shape2 = src_1->info()->tensor_shape(); const TensorShape &out_shape = dst->info()->tensor_shape(); bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -435,8 +477,7 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl:: ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h index 4e62a6d67a..76a3ce02c1 100644 --- a/src/gpu/cl/kernels/ClMulKernel.h +++ b/src/gpu/cl/kernels/ClMulKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_MUL_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -72,16 +73,27 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
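* @note Scales of the form 1/2^n (0 <= n <= 15) take the integer fast path in the implementation: for scale = 0.5f, std::frexp returns mantissa 0.5 and exponent 0, so the kernel stores the shift n = 1 - exponent (here 1) instead of a float multiplier. A minimal call sketch, assuming the tensor infos are initialized by the caller:
* @code
* ClMulKernel mul;
* mul.configure(compile_context, &src1_info, &src2_info, &dst_info, 0.5f, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
* @endcode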
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -101,14 +113,21 @@ public: * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClComplexMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp index 8d4655114b..a4755782ed 100644 --- a/src/gpu/cl/kernels/ClPermuteKernel.cpp +++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp @@ -29,8 +29,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,13 +61,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const "Permutation up to 4-D src tensor is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4, "Permutation vector size should be less than or equal to 4"); - for(const auto &p : perm) + for (const auto &p : perm) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation 
vector has invalid values"); } // Validate configured dst - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -82,10 +83,13 @@ ClPermuteKernel::ClPermuteKernel() _type = CLKernelType::ELEMENTWISE; } -void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +void ClPermuteKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const TensorShape dst_shape = get_dst_shape(src, perm); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); @@ -96,7 +100,8 @@ void ClPermuteKernel::configure(const CLCompileContext &compile_context, const I // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type()))); build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2))); // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0)); @@ -126,8 +131,9 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); @@ -144,9 +150,8 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com add_4D_tensor_argument(idx, src, slice_in); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_in, lws_hint()); - } - while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h index 0d349e739b..2413b10284 100644 --- a/src/gpu/cl/kernels/ClPermuteKernel.h +++ b/src/gpu/cl/kernels/ClPermuteKernel.h @@ -52,7 +52,10 @@ public: * @param[in] dst The dst tensor info. Data types supported: Same as @p src * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClPermuteKernel::configure() diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp index a1afc585e0..41ab4d6922 100644 --- a/src/gpu/cl/kernels/ClPool2dKernel.cpp +++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,37 +44,47 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), - "Unsupported combination of parameters!"); - - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const bool is_global_pooling = pool_info.is_global_pooling; - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - int output_width = 0; - int output_height = 0; - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); - - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size_x, pool_size_y, pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), + "Unsupported combination of parameters!"); + + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const bool is_global_pooling = pool_info.is_global_pooling; + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + int output_width = 0; + int output_height = 0; + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); + + std::tie(output_width, output_height) = + scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size_x, + pool_size_y, pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), + "Calculated output dimension size is invalid"); // Check indices - if(indices) + if (indices) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), + "Pooling indices only supported for pool size 2x2"); - if(indices->total_size() != 0) + if (indices->total_size() != 0) { TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); @@ -81,7 +92,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -98,42 +109,47 @@ ClPool2dKernel::ClPool2dKernel() _type = CLKernelType::POOL; } -void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +void ClPool2dKernel::configure(const ClCompileContext &compile_context, + 
ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); - auto padding_info = get_padding_info({ src, dst, indices }); + auto padding_info = get_padding_info({src, dst, indices}); // Auto init if empty TensorShape out_shape = compute_pool_shape(*src, pool_info); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); - if(indices) + if (indices) { auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); } // Set instance variables - _pool_info = pool_info; - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - _num_elems_processed_per_iteration = (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4); + _pool_info = pool_info; + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + _num_elems_processed_per_iteration = + (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4); _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0)); - int pool_stride_x = 0; - int pool_stride_y = 0; - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool exclude_padding = pool_info.exclude_padding; + int pool_stride_x = 0; + int pool_stride_y = 0; + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool exclude_padding = pool_info.exclude_padding; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const DataType data_type = src->data_type(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const DataType data_type = src->data_type(); // Set build options CLBuildOptions build_opts; @@ -148,20 +164,23 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); + build_opts.add_option("-DMAX_WIDTH=" + + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); + build_opts.add_option("-DMAX_HEIGHT=" + + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); // Tensor paddings are used to calculate the indices for MAX pooling - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && + is_data_type_float(data_type)) { build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3))); } - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { build_opts.add_option("-DQUANTIZED"); - if(src->quantization_info() != dst->quantization_info()) + if (src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -174,9 +193,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } // Set the initial value for the pooling operation according to the data type - if(pool_type == PoolingType::MAX) + if (pool_type == PoolingType::MAX) { - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { PixelValue type_min{}; std::tie(type_min, std::ignore) = get_min_max(data_type); @@ -184,7 +203,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } else { - std::string initial_value = pool_info.use_inf_as_limit ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest()); + std::string initial_value = pool_info.use_inf_as_limit + ?
"(-INFINITY)" + : float_to_string_with_full_precision(std::numeric_limits<float>::lowest()); build_opts.add_option("-DINITIAL_VALUE=" + initial_value); } } @@ -195,22 +216,25 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } // Create kernel - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); - const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : (is_data_type_quantized(data_type) ? DataType::S32 : data_type)); + const auto acc_data_type = get_cl_type_from_data_type( + use_wider_accumulator ? DataType::F32 + : (is_data_type_quantized(data_type) ? DataType::S32 : data_type)); build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); - if(pool_type != PoolingType::MAX) + if (pool_type != PoolingType::MAX) { build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); } - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && + is_data_type_float(data_type)) { // For max pooling with pool2x2, store indicies which will be used in max unpooling std::string kernel_name = "pooling_layer_2_nchw_indices"; @@ -226,18 +250,19 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI case DataLayout::NHWC: { // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + const auto use_fp_mixed_precision = + (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; // Wider accumulation is required to avoid accuracy loss // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) DataType acc_data_type = data_type; - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { acc_data_type = DataType::F32; } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) + else if (is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) { acc_data_type = DataType::S32; } @@ -250,8 +275,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); - if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + if (pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) { build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); @@ -260,7 +286,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } else { - 
std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) + ? "pooling_layer_MxN_quantized_nhwc" + : "pooling_layer_MxN_nhwc"; _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); } break; @@ -290,7 +318,10 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status ClPool2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); return Status{}; @@ -301,18 +332,19 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - unsigned int pool_stride_x = 0; - unsigned int pool_stride_y = 0; + unsigned int pool_stride_x = 0; + unsigned int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); - auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); + auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1)); // Collapse window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -323,13 +355,12 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) + if (indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) { add_3D_tensor_argument(idx, indices, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); break; } case DataLayout::NHWC: @@ -338,7 +369,8 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm Window slice = window_collapsed.first_slice_window_4D(); Window in_slice = window_collapsed.first_slice_window_4D(); - in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); + in_slice.set(Window::DimX, + Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); in_slice.set(3, Window::Dimension(0, batch_size, 1)); @@ 
-348,13 +380,13 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm unsigned int idx = 0; add_4D_tensor_argument(idx, src, in_slice); add_4D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + if (indices && is_data_type_float(src->info()->data_type()) && + (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) { add_4D_tensor_argument(idx, indices, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); + } while (window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); break; } default: diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h index f5bb0687e8..56b95a37d5 100644 --- a/src/gpu/cl/kernels/ClPool2dKernel.h +++ b/src/gpu/cl/kernels/ClPool2dKernel.h @@ -50,22 +50,29 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 1 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp index d068832fed..a08c5d4be7 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.cpp +++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,10 +51,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), "Strides cannot be zero."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding - && (pool_info.pool_type == PoolingType::AVG)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), + "Strides cannot be zero."); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && + (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)), "Exclude padding is unsupported for non-float types for Avg op"); const auto data_layout = src->data_layout(); @@ -68,17 +72,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; int output_depth = 0; - bool round_type_ceil_with_asymm_padding = (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, "Cannot use dimension round type CEIL when padding is asymmetric."); + bool round_type_ceil_with_asymm_padding = + (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, + "Cannot use dimension round type CEIL when padding is asymmetric."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); - std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, - pool_size_z, pool_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); + std::tie(output_width, output_height, output_depth) = + scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), + "Calculated output dimension size is invalid"); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -95,11 +103,14 @@ ClPool3dKernel::ClPool3dKernel() _type = CLKernelType::POOL; } -void ClPool3dKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +void ClPool3dKernel::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Auto init if empty TensorShape out_shape = compute_pool3d_shape(src->tensor_shape(), pool_info); @@ -112,23 +123,23 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT 
_num_elems_processed_per_iteration = (dst->data_type() == DataType::F32) ? 2 : 4; _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0)); - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const int pool_size_z = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth; - const bool exclude_padding = pool_info.exclude_padding; - const int pool_stride_x = pool_info.stride.x(); - const int pool_stride_y = pool_info.stride.y(); - const int pool_stride_z = pool_info.stride.z(); - const int pool_pad_top = pool_info.padding.top; - const int pool_pad_left = pool_info.padding.left; - const int pool_pad_front = pool_info.padding.front; - const DataType data_type = src->data_type(); + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? 
src->dimension(idx_depth) : pool_info.pool_size.depth; + const bool exclude_padding = pool_info.exclude_padding; + const int pool_stride_x = pool_info.stride.x(); + const int pool_stride_y = pool_info.stride.y(); + const int pool_stride_z = pool_info.stride.z(); + const int pool_pad_top = pool_info.padding.top; + const int pool_pad_left = pool_info.padding.left; + const int pool_pad_front = pool_info.padding.front; + const DataType data_type = src->data_type(); // Set build options CLBuildOptions build_opts; @@ -149,7 +160,7 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth))); // If datatype is quantized add relevant parameters - if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -161,9 +172,9 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } // Set the initial value for the pooling operation according to the data type - if(pool_type == PoolingType::MAX) + if (pool_type == PoolingType::MAX) { - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { PixelValue type_min{}; std::tie(type_min, std::ignore) = get_min_max(data_type); @@ -171,7 +182,8 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } else { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest())); + build_opts.add_option("-DINITIAL_VALUE=" + + float_to_string_with_full_precision(std::numeric_limits<float>::lowest())); } } else @@ -181,16 +193,18 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } // Create kernel // Floating point mixed precision is supported on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + const auto use_fp_mixed_precision = + (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; // Wider accumulation is required to avoid accuracy loss // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) DataType acc_data_type = data_type; - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { acc_data_type = DataType::F32; } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division + else if (is_data_type_quantized(data_type) && + pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division { acc_data_type = DataType::S32; } @@ -202,11 +216,13 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst->dimension(idx_depth))); build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); // if datatype is quantized use quantized kernel function - std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" : "pooling_3d_layer_MxN_ndhwc"); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" + : "pooling_3d_layer_MxN_ndhwc"); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); @@ -240,8 +256,9 @@ void ClPool3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); // Collapse 3D window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h index 00852349e6..6cd229c427 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.h +++ b/src/gpu/cl/kernels/ClPool3dKernel.h @@ -50,7 +50,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. 
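 *
 * A minimal usage sketch (illustrative only, not part of the library documentation;
 * compile_ctx, src_info and dst_info are assumed to be a valid CLCompileContext and
 * NDHWC tensor infos set up by the caller):
 * @code
 * Pooling3dLayerInfo pool3d_info{};           // public members, as read by configure()
 * pool3d_info.pool_type = PoolingType::AVG;
 * pool3d_info.pool_size = Size3D(3, 3, 3);
 * ClPool3dKernel kernel;
 * kernel.configure(compile_ctx, &src_info, &dst_info, pool3d_info);
 * @endcode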
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3dKernel::configure() @@ -64,8 +67,8 @@ public: private: Pooling3dLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 1 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp index 5c8bf97f0f..e8df420f67 100644 --- a/src/gpu/cl/kernels/ClQuantizeKernel.cpp +++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -50,12 +49,14 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); // Output must always be initialized ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); return Status{}; @@ -71,7 +72,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); @@ -84,7 +85,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const float scale_to_apply = qinfo.scale; int32_t offset_to_apply = qinfo.offset; - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { /* * In case of requantization of a quantized input tensor to an output tensor with another quantization @@ -132,8 +133,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type)); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - 
vec_size_x, 0))); - std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type); + build_opts.add_option_if( + multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); + std::pair<int, int> min_max_quant_values = + quantization::get_min_max_values_from_quantized_data_type(output_data_type); build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first)); build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second)); @@ -141,9 +144,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const // Configure kernel window Window win = calculate_max_window(*src, Steps()); - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -173,8 +177,7 @@ void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Co add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp index 121bb33edf..53889f3a6b 100644 --- a/src/gpu/cl/kernels/ClReshapeKernel.cpp +++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->tensor_shape().total_size() != 0) + if (dst->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -72,27 +73,17 @@ void ClReshapeKernel::configure(const CLCompileContext &compile_context, const I ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel - std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) }; + std::set<std::string> build_opts = {"-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())}; _kernel = create_kernel(compile_context, "reshape_layer", build_opts); // Add static arguments - const cl_int2 src_shape = - { - { - static_cast<cl_int>(src->tensor_shape()[0]), - static_cast<cl_int>(src->tensor_shape()[1]) - } - }; - const cl_int2 dst_shape = - { - { - static_cast<cl_int>(dst->tensor_shape()[0]), - static_cast<cl_int>(dst->tensor_shape()[1]) - } - }; + const cl_int2 src_shape = { + {static_cast<cl_int>(src->tensor_shape()[0]), static_cast<cl_int>(src->tensor_shape()[1])}}; + const cl_int2 dst_shape = { + {static_cast<cl_int>(dst->tensor_shape()[0]), 
static_cast<cl_int>(dst->tensor_shape()[1])}}; unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters _kernel.setArg<cl_int2>(idx++, src_shape); _kernel.setArg<cl_int2>(idx++, dst_shape); @@ -119,8 +110,9 @@ void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = window_collapsed.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Set srcs unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h index db6ab5da58..95eae82086 100644 --- a/src/gpu/cl/kernels/ClReshapeKernel.h +++ b/src/gpu/cl/kernels/ClReshapeKernel.h @@ -58,7 +58,7 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; }; -} // namespace opencl } // namespace kernels +} // namespace opencl } // namespace arm_compute #endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp index 4c4373a215..4305acad26 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClScaleKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/InterpolationPolicyUtils.h" +#include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,8 @@ namespace kernels { namespace { -inline std::tuple<float, float> calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) +inline std::tuple<float, float> +calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -64,20 +66,25 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1); - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); 
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1); + ARM_COMPUTE_RETURN_ERROR_ON( + info.align_corners && + !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && + !is_data_type_quantized_asymmetric(src->data_type())); float scale_x = 0.f; float scale_y = 0.f; const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); + std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); - ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f)); + ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && + (scale_x > 1.f || scale_y > 1.f)); return Status{}; } @@ -94,23 +101,26 @@ ClScaleKernel::ClScaleKernel() _type = CLKernelType::ELEMENTWISE; } -void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScaleKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Info required for the static tuning _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; const bool is_nhwc = _data_layout == DataLayout::NHWC; - float scale_x = 0.f; - float scale_y = 0.f; + float scale_x = 0.f; + float scale_y = 0.f; std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); // Area interpolation behaves as Nearest Neighbour in case of up-sampling auto interpolation_policy_to_use = info.interpolation_policy; - if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) + if (info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) { interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR; } @@ -127,7 +137,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn unsigned int vec_size_leftover = 0; CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 
4 : 8, dst_channels); vec_size_leftover = dst_channels % vec_size; @@ -135,7 +145,8 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER"); build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + + string_from_pixel_value(info.constant_border_value, src->data_type())); build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use)); @@ -144,27 +155,33 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT"); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", + "-DSAMPLING_POLICY_TOP_LEFT"); } - else if(_data_layout == DataLayout::NCHW) + else if (_data_layout == DataLayout::NCHW) { vec_size = adjust_vec_size(4, dst_width); vec_size_leftover = dst_width % vec_size; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + + string_from_pixel_value(info.constant_border_value, src->data_type())); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) + ? 
support::cpp11::to_string(vec_size) + : support::cpp11::to_string(vec_size_leftover))); build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", + "-DSAMPLING_POLICY_TOP_LEFT"); - const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; - if(is_qasymm_bilinear) + const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && + info.interpolation_policy == InterpolationPolicy::BILINEAR; + if (is_qasymm_bilinear) { const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); @@ -190,7 +207,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); // Pass scale kernel arguments - if(is_nhwc) + if (is_nhwc) { unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc(); _kernel.setArg<cl_float>(idx++, scale_x); @@ -219,7 +236,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -231,8 +248,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma add_2D_tensor_argument(idx, src, slice); add_2D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); break; } case DataLayout::NHWC: diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h index dd09e92ee2..c09659017d 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.h +++ b/src/gpu/cl/kernels/ClScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure. 
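 *
 * A minimal usage sketch (illustrative only; compile_ctx, src_info and dst_info are
 * assumed to exist, and the two-argument ScaleKernelInfo constructor shown here relies
 * on its remaining parameters taking their defaults):
 * @code
 * ScaleKernelInfo info(InterpolationPolicy::BILINEAR, BorderMode::REPLICATE);
 * ClScaleKernel kernel;
 * kernel.configure(compile_ctx, &src_info, &dst_info, info);
 * @endcode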
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClScaleKernel::configure() @@ -62,7 +64,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - DataLayout _data_layout{ DataLayout::UNKNOWN }; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp index 59299fa441..1b5a2666bc 100644 --- a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp @@ -22,12 +22,14 @@ * SOFTWARE. */ #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,15 +62,16 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float // Number of integer bits used in temporary fixed-point representation of exponent accumulator static const int exp_accumulation_in_bits = 12; - const double beta_multiplier = std::min( - 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), - (1LL << 31) - 1.0); + const double beta_multiplier = + std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1LL << 31) - 1.0); int input_beta_multiplier; int input_beta_left_shift; - quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift); + quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, + &input_beta_left_shift); - const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); - const int diff_min = -1.f * std::floor(max_input_rescaled); + const double max_input_rescaled = + 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); + const int diff_min = -1.f * std::floor(max_input_rescaled); CLBuildOptions build_opts; build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits)); @@ -80,18 +83,22 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float return build_opts; } -Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) +Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const ITensorInfo &sum) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); 
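// Worked example for the fixed-point setup in prepare_quantized_softmax_build_options()
// above (a sketch only: scaled_diff_int_bits is taken as 5, since its definition is
// elided here, with a QASYMM8 input scale of 1/256 and beta == 1):
//   beta_multiplier    = min(1.0 * (1/256) * 2^(31 - 5), 2^31 - 1) = 2^18 = 262144
//   quantized form     : input_beta_multiplier = 2^30, input_beta_left_shift = 19
//                        (262144 == 0.5 * 2^19)
//   max_input_rescaled = (2^5 - 1) * 2^(31 - 5) / 2^19 = 31 * 128 = 3968
//   DIFF_MIN           = -3968, i.e. values more than 3968 fixed-point units below the
//                        row maximum contribute a negligible term to the exponent sum.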
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); // Checks performed when output is configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { - if(is_quantized_asymmetric) + if (is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32); } @@ -103,9 +110,9 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensor } // Checks performed when sum is configured - if(sum.total_size() != 0) + if (sum.total_size() != 0) { - if(is_quantized_asymmetric) + if (is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32); } @@ -119,7 +126,10 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensor return Status{}; } -Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +Status validate_arguments_1DNorm(const ITensorInfo &src, + const ITensorInfo &sum, + const ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32); @@ -127,14 +137,15 @@ Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type)); // Note: output should always have a scale of 1/256 and offset 0 - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); + const QuantizationInfo allowed_quantization_info = + get_softmax_output_quantization_info(info.input_data_type, info.is_log); + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); // Checks performed when output is configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - if(!is_quantized_asymmetric) + if (!is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); } @@ -161,9 +172,14 @@ ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel() _type = CLKernelType::ELEMENTWISE; } -void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info) +void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &max, + ITensorInfo &dst, + ITensorInfo &sum, + const SoftmaxKernelInfo &info) { - auto padding_info = get_padding_info({ &src, &max, &dst, &sum }); + auto padding_info = get_padding_info({&src, &max, &dst, &sum}); // Output auto initialization if not yet initialized auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape())); @@ -191,15 +207,21 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size)))); build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE"); build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED"); - build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + 
float_to_string_with_full_precision(beta)); + build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), + "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX"); - build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX"))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); + build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") + : std::string("-FLT_MAX"))); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), + prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); cl::NDRange lws_hint(cl::NullRange); - std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial"; + std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial"; // Create kernel. _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -211,7 +233,10 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) +Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const ITensorInfo &sum) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); return Status{}; @@ -241,7 +266,7 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window & // Reconfigure window in case of parallel reduction ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0)); - if(std::get<0>(parallel_reduction_info)) + if (std::get<0>(parallel_reduction_info)) { // Launch grid_size parallel work items window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1)); @@ -258,8 +283,7 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window & add_3D_tensor_argument(idx, dst, slice); add_3D_tensor_argument(idx, sum, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } ClLogits1DNormKernel::ClLogits1DNormKernel() @@ -267,18 +291,24 @@ ClLogits1DNormKernel::ClLogits1DNormKernel() _type = CLKernelType::ELEMENTWISE; } -void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info) +void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, 
+ const ITensorInfo &sum, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { - auto padding_info = get_padding_info({ &src, &dst, &sum }); + auto padding_info = get_padding_info({&src, &dst, &sum}); // Note: output should always have a scale of 1/256 and offset 0 - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); - const DataType output_data_type = info.input_data_type; - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); + const DataType output_data_type = info.input_data_type; + const QuantizationInfo allowed_quantization_info = + get_softmax_output_quantization_info(info.input_data_type, info.is_log); + const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); // Output auto initialization if not yet initialized - auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); + auto_init_if_empty(dst, + src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info)); @@ -311,7 +341,10 @@ void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +Status ClLogits1DNormKernel::validate(const ITensorInfo &src, + const ITensorInfo &sum, + const ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info)); @@ -343,9 +376,8 @@ void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, :: add_3D_tensor_argument(idx, sum, sum_slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h index a221e12132..2dd53da346 100644 --- a/src/gpu/cl/kernels/ClSoftmaxKernel.h +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -61,14 +62,20 @@ public: * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &max, + ITensorInfo &dst, + ITensorInfo &sum, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); + static Status + validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); /** Checks if the given size is eligible for parallel reduction * * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size). @@ -100,14 +107,19 @@ public: * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. 
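 *
 * A minimal usage sketch (illustrative only; compile_ctx plus the src/sum/dst tensor
 * infos are assumed to come from the preceding max-shift-exp-sum stage, and the field
 * names follow @ref SoftmaxKernelInfo):
 * @code
 * SoftmaxKernelInfo info{};
 * info.beta            = 1.0f;
 * info.is_log          = false;
 * info.input_data_type = src_info.data_type();
 * ClLogits1DNormKernel norm;
 * norm.configure(compile_ctx, src_info, sum_info, dst_info, info);
 * @endcode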
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + const ITensorInfo &sum, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogits1DNormKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); + static Status + validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp index 6450ffb5b2..6eb2bf81c0 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -58,12 +59,12 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); + const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; CLBuildOptions build_opts; @@ -89,9 +90,10 @@ Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); // Validate configured dst - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); + const TensorInfo dst_info = + src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -106,8 +108,9 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = 
utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_2D(); @@ -117,9 +120,8 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C add_2D_tensor_argument(idx, src, slice); add_2D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp index ae825694c5..76f39ac500 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp @@ -26,14 +26,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - namespace arm_compute { namespace opencl @@ -42,11 +42,15 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8_SIGNED, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC); @@ -56,12 +60,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, constexpr unsigned int height_idx = 2; constexpr unsigned int batch_idx = 3; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -77,15 +82,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const size_t input_width = input->dimension(width_idx); const size_t input_height = input->dimension(height_idx); const size_t weights_width = weights->dimension(width_idx); const size_t weights_height = weights->dimension(height_idx); - auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); - TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + auto out_dims = + deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); + TensorShape output_shape = + 
misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -96,8 +103,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } } // namespace -void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_UNUSED(biases, deconv_info); ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -119,7 +130,8 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const size_t output_channels = output->dimension(channel_idx); // Calculate output shape - auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); + auto out_dims = + deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->quantization_info()); @@ -147,7 +159,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const DataType input_data_type = input->data_type(); const PaddingInfo strides = deconv_info.stride(); - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -180,7 +192,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP"); - if(is_data_type_quantized(output_data_type)) + if (is_data_type_quantized(output_data_type)) { const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -210,7 +222,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); } - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -235,8 +247,11 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co _config_id += support::cpp11::to_string(n0); } -Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *dst, const PadStrideInfo &deconv_info) +Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, deconv_info)); return Status{}; @@ -250,17 +265,20 @@ void ClTransposedConvolutionKernel::run_op(ITensorPack &tensors, 
const Window &w // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, dst); add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h index d4350dda50..44f6f56b7a 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h @@ -45,16 +45,23 @@ public: * Similar to @ref ClTransposedConvolution::configure() * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -63,4 +70,4 @@ public: } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp index 8f36345076..af80c4d796 100644 --- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -39,7 +41,10 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *biases, + const ITensorInfo *output, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -48,20 +53,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1); ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type())); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); + ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && + (biases->dimension(0) != input->tensor_shape()[3])); + ARM_COMPUTE_RETURN_ERROR_ON( + (input->num_dimensions() == 5) && + (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -75,16 +84,21 @@ ClWeightsReshapeKernel::ClWeightsReshapeKernel() { _type = CLKernelType::ELEMENTWISE; } -void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups) +void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *biases, + ITensorInfo *dst, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, 
(biases != nullptr), num_groups))); + auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups))); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups)); - auto padding_info = get_padding_info({ src, biases, dst }); + auto padding_info = get_padding_info({src, biases, dst}); const DataType data_type = src->data_type(); @@ -104,7 +118,10 @@ void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups) +Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, + const ITensorInfo *biases, + const ITensorInfo *dst, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups)); return Status{}; } @@ -136,7 +153,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, _kernel.setArg<cl_uint>(idx++, src->info()->dimension(3)); _kernel.setArg<cl_uint>(idx++, dst->info()->strides_in_bytes().z()); - if(biases != nullptr) + if (biases != nullptr) { biases_window.use_tensor_dimensions(biases->info()->tensor_shape()); biases_slice = biases_window.first_slice_window_1D(); @@ -148,7 +165,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, unsigned idx = 0; add_3D_tensor_argument(idx, src, in_slice); add_2D_tensor_argument(idx, dst, out_slice); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, biases_slice); ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice)); @@ -156,8 +173,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, // Run kernel enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + } while (window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h index 7364eb97ae..5e05f8d006 100644 --- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h @@ -75,14 +75,19 @@ public: * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * Number of groups greater than one is only supported for NCHW data layout, and the number of weights must be a multiple of it. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *biases, + ITensorInfo *dst, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWeightsReshapeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); + static Status + validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -90,4 +95,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
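For orientation, the reshape that compute_weights_reshaped_shape() drives above flattens each convolution filter into one row of a 2D matrix for the GEMM-based convolution path, appending one element per filter when a bias is folded in and splitting the output channels across num_groups. A hedged sketch of the element counts only; the exact axis order and orientation in the library may differ:

#include <cstdio>

// Hedged sketch of the weights-reshape bookkeeping for a [W, H, IFM, OFM]
// filter bank: each filter flattens to W*H*IFM values (+1 for a fused bias),
// and OFM is divided across num_groups (NCHW only, per the checks above).
int main()
{
    const unsigned int w = 3, h = 3, ifm = 64, ofm = 128;
    const bool         has_bias   = true;
    const unsigned int num_groups = 1; // must divide the OFM dimension

    const unsigned int row_len = w * h * ifm + (has_bias ? 1u : 0u);
    std::printf("per group: %u filters x %u elements, %u group(s)\n",
                ofm / num_groups, row_len, num_groups);
    return 0;
}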
\ No newline at end of file +#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp index 0a9a3f021f..15195025ce 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp @@ -29,11 +29,11 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/tensor_info.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); @@ -63,7 +63,8 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons } } // namespace -Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +Status +ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst)); return Status{}; } @@ -74,12 +75,15 @@ ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel() { _type = CLKernelType::ELEMENTWISE; } -void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0)); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); @@ -91,11 +95,12 @@ void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); // If inputs have different quantization info, set quantization parameters needed for the re-quantization process const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2); - 
if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -146,9 +151,11 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window Window slice = window.first_slice_window_4D(); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); do { @@ -159,8 +166,7 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_int>(idx++, _depth); _kernel.setArg<cl_int>(idx++, _input1_width); enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); + } while (window.slide_window_slice_4D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h index 5c54479002..8b53d6d66b 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h @@ -62,8 +62,8 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; - int32_t _input1_width{ 0 }; + int32_t _depth{0}; + int32_t _input1_width{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp index 54f7ad344a..c4f84e3e45 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp @@ -30,11 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/tensor_info.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -45,15 +45,20 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0)); 
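The INPUT1_ROTATE_N option computed above tells the kernel how far a vector that straddles the seam between the two concatenated tensors must be rotated so its lanes realign with the destination. A small sketch of that arithmetic, with adjust_vec_size(8, ...) approximated by a plain min (an assumption about its behaviour):

#include <algorithm>
#include <cstdio>

// Hedged sketch of the two-tensor width-concat build options: VEC_SIZE lanes
// per work-item, VEC_SIZE_LEFTOVER from the destination remainder, and the
// seam rotation (src1_w - leftover) % VEC_SIZE, mirroring the code above.
int main()
{
    const unsigned int src1_w = 30, src2_w = 50;
    const unsigned int dst_w = src1_w + src2_w;

    const unsigned int vec_size = std::min(8u, std::min(src1_w, src2_w));
    const unsigned int leftover = dst_w % vec_size;
    const unsigned int rotate1  = (src1_w - leftover) % vec_size;

    std::printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u INPUT1_ROTATE_N=%u\n",
                vec_size, leftover, rotate1);
    return 0;
}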
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > + dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); @@ -71,22 +76,29 @@ ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel() _type = CLKernelType::ELEMENTWISE; } -Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst)); return Status{}; } void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src1, ITensorInfo *src2, - ITensorInfo *src3, ITensorInfo *src4, - ITensorInfo *dst) + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *src3, + ITensorInfo *src4, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst)); - auto padding_info = get_padding_info({ src1, src2, src3, src4, dst }); - const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); + auto padding_info = get_padding_info({src1, src2, src3, src4, dst}); + const unsigned int min_dimension = + std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; @@ -96,9 +108,14 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT2_ROTATE_N=" + + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + + src3->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); _depth = src1->dimension(2); _input1_width = src1->dimension(0); @@ 
-106,8 +123,9 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile _input3_width = src3->dimension(0); // If sources have different quantization info, set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + const bool have_different_qinfo = + helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); + if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -166,11 +184,15 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); - const auto src3 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); + const auto src3 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_4D(); @@ -187,8 +209,7 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_int>(idx++, _input2_width); _kernel.setArg<cl_int>(idx++, _input3_width); enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); + } while (window.slide_window_slice_4D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h index baf8d381be..f589b8ac1a 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h @@ -52,23 +52,32 @@ public: * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1 * @param[out] dst Destination tensor info. Data types supported: same as @p src1. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *src3, + ITensorInfo *src4, + ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClWidthConcatenate4TensorsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; - int32_t _input1_width{ 0 }; - int32_t _input2_width{ 0 }; - int32_t _input3_width{ 0 }; + int32_t _depth{0}; + int32_t _input1_width{0}; + int32_t _input2_width{0}; + int32_t _input3_width{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp index 2dfe7fce52..989de4a7b7 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -53,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -74,12 +74,15 @@ Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int w return Status{}; } -void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) +void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int width_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0)); @@ -87,10 +90,11 @@ void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); 
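These concatenation kernels also emit offset/scale build options so quantized inputs can be re-quantized on the fly when their UniformQuantizationInfo differs from the destination's, as the branches above and below show. A sketch of the standard asymmetric 8-bit mapping that implies; this is not the library's kernel code:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Hedged sketch: dequantize with the input's (scale, offset), requantize with
// the output's, clamp to the 8-bit range.
static uint8_t requantize_u8(uint8_t q, float in_scale, int32_t in_offset,
                             float out_scale, int32_t out_offset)
{
    const float   real = in_scale * (static_cast<int32_t>(q) - in_offset);
    const int32_t q2   = static_cast<int32_t>(std::lround(real / out_scale)) + out_offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q2, 0, 255));
}

int main()
{
    // 35 under (scale 0.02, offset 10) encodes 0.5; under (0.05, 0) that is 10.
    std::printf("%u\n", requantize_u8(35, 0.02f, 10, 0.05f, 0));
    return 0;
}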
build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); @@ -121,8 +125,9 @@ void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h index 3ace4400e6..c10d6a4dc6 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h @@ -50,7 +50,8 @@ public: * @param[in,out] dst Destination tensor info. Data types supported: same as @p src. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClWidthConcatenateKernel::configure() @@ -63,7 +64,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; + int32_t _depth{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp index 7148a4c85c..58c01d4da5 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,14 +61,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported"); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); 
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd filter transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || + input->dimension(idx_h) != kernel_size.height); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -81,11 +86,15 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(output); - const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; + const unsigned int num_elems_processed_per_iteration_x = + input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2); + const unsigned int num_elems_read_per_iteration_z = + input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); + Window win = + calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, + num_elems_read_per_iteration_z)); Window win_collapsed = win.collapse(win, Window::DimZ); return std::make_pair(Status{}, win_collapsed); } @@ -96,21 +105,25 @@ ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel() { _type = CLKernelType::WINOGRAD; } -void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Set build options CLBuildOptions build_opts; // For NHWC layouts pass tensor dimensions at runtime - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { _src_dim_z = src->dimension(2); } @@ -125,7 +138,8 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ const Size2D output_tile_size = winograd_info.output_tile_size; // Create kernel - std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + 
kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); + std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -138,7 +152,9 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); @@ -161,7 +177,7 @@ void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); add_3D_tensor_argument(idx, dst, window_out); - if(src->info()->data_layout() == DataLayout::NHWC) + if (src->info()->data_layout() == DataLayout::NHWC) { _kernel.setArg<cl_uint>(idx++, _src_dim_z); } diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h index b2130304e6..6e439f0c99 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,10 @@ public: * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. 
Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradFilterTransformKernel::configure() @@ -72,7 +76,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - int32_t _src_dim_z{ 0 }; + int32_t _src_dim_z{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp index fab6c36032..54c48986fc 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -55,17 +56,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd input transform only supports unit strides"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd input transform not supported"); ARM_COMPUTE_UNUSED(conv_info); ARM_COMPUTE_UNUSED(output_tile_size); ARM_COMPUTE_UNUSED(kernel_size); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -74,7 +79,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -82,7 +88,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + 
if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. @@ -94,9 +100,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const size_t dim0 = input->dimension(0); const size_t k_sz = winograd_info.kernel_size.area(); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { - if(k_sz == 3 || k_sz == 9) + if (k_sz == 3 || k_sz == 9) { num_elems_processed_per_iteration = 2; } @@ -104,7 +110,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -113,11 +119,13 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1; unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1; - AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); + AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), + num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); window_changed = update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -132,12 +140,15 @@ BorderSize ClWinogradInputTransformKernel::border_size() const return _border_size; } -void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -150,14 +161,13 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c // Compute the number of output tiles along the x and y direction of size "output_tile_size" const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)), - kernel_size, - output_tile_size, - conv_info); + kernel_size, output_tile_size, conv_info); _num_tiles_x = num_tiles.width; _num_tiles_y = num_tiles.height; - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); @@ -174,7 +184,7 @@ void 
ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _src_height = src->dimension(idx_h); CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { build_opts.add_option("-DNHWC"); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -201,13 +211,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c } // Create kernel - std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); + std::string kernel_name = + "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); // Get the maximum dimension from the tile size const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); // Check optimized kernel if output_dims == 2x2 - if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) + if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) { _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2; } @@ -239,11 +250,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); return Status{}; } @@ -263,7 +277,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & // Collapse window Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { Window slice = window_collapsed.first_slice_window_3D(); slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1)); @@ -298,8 +312,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h index c10c528b9b..cebebea1d3 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,10 @@ public: * @param[in] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradInputTransformKernel::configure() @@ -69,19 +73,19 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>; - BorderSize _border_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - int _num_tiles_x{ 0 }; - int _num_tiles_y{ 0 }; - unsigned int _step_z{ 1 }; - int32_t _src_width{ 0 }; - int32_t _src_height{ 0 }; + BorderSize _border_size{0}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + int _num_tiles_x{0}; + int _num_tiles_y{0}; + unsigned int _step_z{1}; + int32_t _src_width{0}; + int32_t _src_height{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index bf974d30d8..89c80c55ef 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -23,7 +23,6 @@ */ #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -31,10 +30,12 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -54,7 +55,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); @@ -66,30 +71,32 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; const Size2D input_dimensions = winograd_info.input_dimensions; - const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); + const unsigned int num_channels 
= (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * + (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), + "Winograd output transform not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); // Compute number of elements to process in the X and Y direction // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area()))); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -98,14 +105,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *bias, + ITensorInfo *output, + const Size2D &output_tile_size) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(bias); unsigned int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. 
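Both the input and output transforms size their work from the same tile count. With the unit strides Winograd requires, this is a sketch of the arithmetic assumed to sit behind compute_winograd_convolution_tiles(); verify against the library before relying on it:

#include <cmath>
#include <cstdio>

// Hedged sketch: convolution output extent along one axis at stride 1 is
// in + pad_begin + pad_end - (kernel - 1), covered by ceil(extent / tile) tiles.
static unsigned int winograd_num_tiles(unsigned int in, unsigned int kernel,
                                       unsigned int tile,
                                       unsigned int pad_begin, unsigned int pad_end)
{
    const unsigned int extent = in + pad_begin + pad_end - (kernel - 1);
    return static_cast<unsigned int>(std::ceil(extent / static_cast<float>(tile)));
}

int main()
{
    // Example: width 224, 3x3 kernel, 4x4 output tiles, SAME padding (1, 1) -> 56 tiles.
    std::printf("%u tiles\n", winograd_num_tiles(224, 3, 4, 1, 1));
    return 0;
}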
@@ -115,7 +125,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const DataType dt = input->data_type(); const size_t dim0 = input->dimension(0); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { num_elems_processed_per_iteration = 2; } @@ -124,17 +134,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; - if(output->data_layout() == DataLayout::NCHW) + if (output->data_layout() == DataLayout::NCHW) { const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, + num_elems_processed_per_iteration); AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); window_changed = update_window_and_padding(win, input_access, output_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -144,13 +156,18 @@ ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel() _type = CLKernelType::WINOGRAD; } -void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, +void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info)); @@ -159,7 +176,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC; @@ -168,14 +185,13 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; const PadStrideInfo conv_info = winograd_info.convolution_info; - const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); + const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); + const 
int idx_height = + get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); const size_t total_batches = dst->tensor_shape().total_size_upper(3); // Set build options @@ -184,11 +200,11 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) + if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) { build_opts.add_option("-DVEC_SIZE=2"); } - else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) + else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) { build_opts.add_option("-DVEC_SIZE=4"); } @@ -200,9 +216,10 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const auto act_function = act_info.activation(); const auto src_data_type = src->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (src_data_type == DataType::F32 || src_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (src_data_type == DataType::F32 || src_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -213,7 +230,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option("-cl-fast-relaxed-math"); } - if(_is_nhwc) + if (_is_nhwc) { build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -247,7 +264,9 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ _dst_height = dst->dimension(idx_height); // Create kernel - std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); + std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + + lower_string(string_from_data_layout(winograd_info.output_data_layout)); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -271,10 +290,18 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ 
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); } -Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), + (bias != nullptr ? bias->clone().get() : nullptr), + dst->clone().get(), winograd_info.output_tile_size) + .first); return Status{}; } @@ -299,7 +326,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(bias != nullptr) + if (bias != nullptr) { unsigned int idx1 = 2 * num_arguments_per_4D_tensor(); Window slice_biases; @@ -307,7 +334,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_1D_tensor_argument(idx1, bias, slice_biases); } - if(_is_nhwc) + if (_is_nhwc) { unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); _kernel.setArg(idx2++, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); @@ -322,8 +349,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h index 6f018967d0..65bb963061 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -61,7 +62,11 @@ public: * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -69,7 +74,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -77,11 +86,11 @@ public: private: using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>; - bool _is_nhwc{ false }; - int32_t _src_height{ 0 }; - int32_t _dst_width{ 0 }; - int32_t _dst_height{ 0 }; - int32_t _num_tiles_x{ 0 }; + bool _is_nhwc{false}; + int32_t _src_height{0}; + int32_t _dst_width{0}; + int32_t _dst_height{0}; + int32_t _num_tiles_x{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp index 9350bf74bb..b5ebac3b49 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp @@ -39,14 +39,24 @@ namespace kernels { namespace gemm { -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image) { ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); ARM_COMPUTE_ERROR_ON(v0 == 0); v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1)); - if(h0 == 0) + if (h0 == 0) { // When h0 is 0, we should take the maximum H0 possible h0 = std::max(n / n0, 1U); @@ -62,17 +72,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned return std::make_pair(lhs_info, rhs_info); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type) { - ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, "The fallback GeMM configuration cannot have export_to_cl_image = true"); + ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, + "The fallback GeMM configuration 
cannot have export_to_cl_image = true"); const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); const TensorInfo tensor_reshaped_info(shape, 1, data_type); - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) { return info_img; } @@ -90,42 +105,56 @@ void update_padding_for_cl_image(ITensorInfo *tensor) const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); - if(pixel_alignment == 0) + if (pixel_alignment == 0) { return; } const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; - const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; - const unsigned int padding = round_up_width - stride_y_in_elements; + const unsigned int round_up_width = + ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + const unsigned int padding = round_up_width - stride_y_in_elements; tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0)); } Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info) { - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, "Export to cl_image only supported with n0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, "Export to cl_image only supported with k0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, + "Export to cl_image only supported with n0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, + "Export to cl_image only supported with k0 = 4, 8 or 16"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), + "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, + "Impossible to retrieve the cl_image pitch alignment"); // Check the width and height of the output tensor. 
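
The padding computed in update_padding_for_cl_image() above rounds the row stride up to the device's image pitch alignment. A small worked sketch of that round-up with illustrative numbers (the 64-pixel alignment is an assumption, not a queried device value):

#include <iostream>

int main()
{
    // Illustrative inputs; in the library they come from the tensor strides
    // and from get_cl_image_pitch_alignment() on the target device.
    const unsigned int stride_y_in_elements = 1000; // elements per row
    const unsigned int pixel_alignment      = 64;   // assumed pitch alignment, in pixels
    const unsigned int num_floats_per_pixel = 4;    // one RGBA float pixel

    const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; // 256 elements
    const unsigned int round_up_width =
        ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; // 1024
    const unsigned int padding = round_up_width - stride_y_in_elements; // 24 elements of right padding

    std::cout << "padding = " << padding << " elements\n";
}
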
// Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, + "Not supported width for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, + "Not supported height for cl_image"); } return Status{}; } -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0) +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0) { ARM_COMPUTE_UNUSED(n, k, b, data_type); @@ -141,7 +170,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const return ((k % mmul_k0) == 0) && (gws_y > 4); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) { size_t min_acc = std::numeric_limits<size_t>::max(); size_t min_idx = 0; @@ -150,12 +180,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConf const size_t num_rows = configs.size(); const size_t num_cols = configs[0].size(); - ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); + ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, " + "N0. 
K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); ARM_COMPUTE_UNUSED(num_cols); // Find nearest GeMM workload // Note: the workload does not depend on the K dimension - for(size_t y = 0; y < num_rows; ++y) + for (size_t y = 0; y < num_rows; ++y) { size_t mc0 = static_cast<size_t>(configs[y][0]); size_t nc0 = static_cast<size_t>(configs[y][1]); @@ -168,7 +199,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConf acc += (k - kc0) * (k - kc0); acc += (b - bc0) * (b - bc0); acc = std::sqrt(acc); - if(acc < min_acc) + if (acc < min_acc) { min_acc = acc; min_idx = y; diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h index 6689b10e69..84776fb207 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h @@ -54,8 +54,18 @@ using GeMMConfigsMatrix = std::vector<std::vector<int32_t>>; * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image = false); /** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo * @@ -72,9 +82,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type); /** Update padding required to export the OpenCL buffer to OpenCL image2d * @@ -103,8 +117,13 @@ Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, * * @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise */ -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0); +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0); /** Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by the user * @@ -116,7 +135,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +find_lhs_rhs_info(const 
GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); } // namespace gemm } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h index a49836cfda..9d08633963 100644 --- a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h @@ -26,6 +26,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include <array> @@ -56,8 +57,7 @@ public: * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -69,7 +69,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -96,8 +96,7 @@ public: * * @param[in] arch GPU target */ - IClGemmKernelConfig(GPUTarget arch) - : _target(arch) + IClGemmKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig); @@ -111,7 +110,8 @@ public: * @param[in] b Batch size * @param[in] data_type Data type */ - virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; + virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; protected: GPUTarget _target; diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp index d74c7fac9b..2f37eef31f 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,31 +39,34 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic - 
&ClGemmDefaultConfigNativeBifrost::configure_G71_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, - &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_default_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_default_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -79,18 +83,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -105,20 +110,21 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -129,7 +135,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -141,9 +147,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
ClGemmDefaultConfigNativeBifrost } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -159,24 +165,25 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 4196) + if (n > 4196) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); } else { - if(k < 2048) + if (k < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); } - else if(k >= 2048 && k < 16384) + else if (k >= 2048 && k < 16384) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } @@ -192,18 +199,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -214,7 +222,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -225,7 +233,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -233,7 +242,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -243,4 +253,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
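
The native and reshaped config classes in this diff all share the dispatch pattern introduced by CLGEMMConfigArray and IClGemmKernelConfig: a per-data-type array of member-function pointers, selected per GPU target and then invoked through this->*. A condensed, self-contained sketch of that pattern (Heuristic and its block-size values are invented for illustration):

#include <array>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <utility>

enum class DataType { F32, F16, Int8 };

class Heuristic
{
public:
    using ConfigFn = std::pair<int, int> (Heuristic::*)(unsigned int m, unsigned int n);

    std::pair<int, int> configure(unsigned int m, unsigned int n, DataType dt)
    {
        // One slot per data type, as in CLGEMMConfigArray; a slot may be
        // nullptr when a backend has no heuristic for that type (cf. Midgard).
        const std::array<ConfigFn, 3> table{&Heuristic::f32_cfg, &Heuristic::f32_cfg /* F16 reuses F32 */,
                                            &Heuristic::q8_cfg};
        const ConfigFn fn = table[static_cast<std::size_t>(dt)];
        if (fn == nullptr)
        {
            throw std::runtime_error("Data type not supported for GEMM");
        }
        return (this->*fn)(m, n); // dispatch through the member-function pointer
    }

private:
    // Toy block-size choices standing in for the configure_*_f32 / configure_*_u8 members.
    std::pair<int, int> f32_cfg(unsigned int m, unsigned int n) { return {m < 64 ? 4 : 5, n <= 4 ? 2 : 4}; }
    std::pair<int, int> q8_cfg(unsigned int m, unsigned int n) { return {m < 64 ? 2 : 4, n <= 4 ? 8 : 16}; }
};

int main()
{
    Heuristic h;
    const auto [m0, n0] = h.configure(128, 8, DataType::F32);
    std::cout << "m0=" << m0 << " n0=" << n0 << "\n"; // m0=5 n0=4
}
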
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h index 9af5dc4135..f822daae53 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h @@ -45,15 +45,22 @@ public: ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp index b9f36c7210..f87fb1b659 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,18 +39,17 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
ClGemmDefaultConfigNativeMidgard::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, - nullptr, + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, nullptr, &ClGemmDefaultConfigNativeMidgard::default_q8); auto func = configs_default.get_function(data_type); @@ -57,7 +57,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -70,4 +71,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
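
find_lhs_rhs_info() in ClGemmHelpers.cpp earlier in this diff picks a row from the user-supplied configuration matrix by minimising the Euclidean distance between the requested (M, N, K, B) workload and each row's reference workload. A self-contained sketch of that search, reduced to the four workload columns (the real rows carry 14 values, including the block sizes to return):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

static std::size_t nearest_config(const std::vector<std::vector<int>> &configs, int m, int n, int k, int b)
{
    double      min_acc = std::numeric_limits<double>::max();
    std::size_t min_idx = 0;

    for (std::size_t y = 0; y < configs.size(); ++y)
    {
        // Distance to the reference workload stored in columns 0..3 of the row
        const double dm  = m - configs[y][0];
        const double dn  = n - configs[y][1];
        const double dk  = k - configs[y][2];
        const double db  = b - configs[y][3];
        const double acc = std::sqrt(dm * dm + dn * dn + dk * dk + db * db);
        if (acc < min_acc)
        {
            min_acc = acc;
            min_idx = y;
        }
    }
    return min_idx;
}

int main()
{
    const std::vector<std::vector<int>> configs = {{64, 64, 64, 1}, {1024, 1024, 256, 1}};
    std::cout << "nearest row: " << nearest_config(configs, 900, 800, 200, 1) << "\n"; // nearest row: 1
}
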
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h index c055753c48..fa76c5dba7 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h @@ -45,10 +45,12 @@ public: ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp index 95a4d2bd69..97a1298b0a 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,37 +39,38 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, - &ClGemmDefaultConfigNativeValhall::configure_G77_f16, - &ClGemmDefaultConfigNativeValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default( + &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16, + &ClGemmDefaultConfigNativeValhall::configure_G77_u8); auto func = configs_default.get_function(data_type); ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
+ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -83,18 +85,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -109,20 +112,21 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -133,7 +137,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -145,9 +149,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -165,4 +169,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
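
configure_G77_u8() above is typical of how these per-target heuristics branch: first on whether the device supports the 8-bit dot-product extension, then on m == 1 (the matrix-vector case) with thresholds over n. The sketch below transcribes the branch values visible in the hunks; the returns marked "assumed" correspond to context lines the diff elides:

#include <iostream>
#include <tuple>

// Returns (m0, n0, k0) block sizes, following the configure_G77_u8() structure.
static std::tuple<unsigned int, unsigned int, unsigned int> g77_u8_blocks(bool dot8, unsigned int m, unsigned int n)
{
    if (dot8)
    {
        if (m == 1) // GEMV-like workload: block only along n
        {
            if (n < 2048)
            {
                return {1, 2, 16};
            }
            if (n < 16384)
            {
                return {1, 4, 16};
            }
            return {1, 4, 16}; // wide-n value: assumed
        }
        if (m < 64)
        {
            return {2, 2, 16};
        }
        return {4, 4, 16}; // assumed
    }
    if (m == 1 && n < 8192)
    {
        return {1, 4, 16};
    }
    return {4, 4, 16}; // fallback without dot8: assumed
}

int main()
{
    const auto [m0, n0, k0] = g77_u8_blocks(true, 1, 4096);
    std::cout << m0 << ", " << n0 << ", " << k0 << "\n"; // 1, 4, 16
}
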
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h index f0f812fd46..c91b095279 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h @@ -45,12 +45,16 @@ public: ClGemmDefaultConfigNativeValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h index cf8412830b..955bb3c01a 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h @@ -51,7 +51,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu); diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp index 657018eb53..c956c347ef 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -43,30 +44,31 @@ namespace gemm { using namespace arm_compute::misc::shape_calculator; -ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, 
GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52( + &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -83,12 +85,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -98,12 +101,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); } @@ -113,14 +117,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned 
int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); } @@ -131,7 +136,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } else { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); } @@ -142,7 +147,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; @@ -154,100 +160,108 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(workload <= 274.4000f) + if (workload <= 274.4000f) { - if(r_nk <= 0.7461f) + if (r_nk <= 0.7461f) { - if(r_mn <= 21.1667f) + if (r_mn <= 21.1667f) { return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mk <= 17.3926f) + if (r_mk <= 17.3926f) { - if(workload <= 542.4000f) + if (workload <= 542.4000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_nk <= 0.5463f) + if (r_nk <= 0.5463f) { - if(workload <= 11767.6001f) + if (workload <= 11767.6001f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * 
static_cast<float>(b)) / 20.0f; - if(workload <= 323.4000f) + if (workload <= 323.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); } @@ -257,7 +271,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -268,7 +283,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro GEMMRHSMatrixInfo rhs_info_img; // Get lhs_info/rhs_info in case of OpenCL buffer - if(n <= 4) + if (n <= 4) { std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -279,15 +294,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro // Get lhs_info/rhs_info in case of OpenCL image // Condition on the GPU workload - if((m / 4) * (n / 4) >= 2560) + if ((m / 4) * (n / 4) >= 2560) { // Big workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); } else { // Small workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -297,7 +314,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = (n <= 4) ? 
false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -307,16 +324,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(workload <= 1595.2000f) + if (workload <= 1595.2000f) { - if(r_mk <= 2.1044f) + if (r_mk <= 2.1044f) { - if(workload <= 870.4000f) + if (workload <= 870.4000f) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); } @@ -336,12 +354,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h index d86d1ba0a7..9227ec2551 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, 
unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp index 58d0873b86..70b324eb5a 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,26 +39,27 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + 
&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -72,12 +74,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); } @@ -87,7 +90,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -104,17 +108,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - if(r_mk <= 0.11824845522642136) + if (r_mk <= 0.11824845522642136) { - if(workload <= 880.0) + if (workload <= 880.0) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } else { - if(r_nk <= 0.42521367967128754) + if (r_nk <= 0.42521367967128754) { - if(workload <= 1726.4000244140625) + if (workload <= 1726.4000244140625) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); } @@ -123,13 +127,12 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(workload <= 1241.6000366210938) + if (workload <= 1241.6000366210938) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } @@ -142,17 +145,16 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11404.7998046875) + if (workload <= 11404.7998046875) { - if(r_mk <= 1.0126488208770752) + if (r_mk <= 1.0126488208770752) { - if(r_mn <= 2.545312523841858) + if (r_mn <= 2.545312523841858) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -161,43 +163,39 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 2881.199951171875) + if (workload <= 2881.199951171875) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - if(r_nk <= 0.5765306055545807) + if (r_nk <= 0.5765306055545807) { - if(r_mn <= 6.010416746139526) + if (r_mn <= 6.010416746139526) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else @@ -205,27 +203,27 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 1288.0000f) + if (workload <= 1288.0000f) { - if(workload <= 505.6000f) + if (workload <= 505.6000f) { - if(r_mn <= 0.4466f) + if (r_mn <= 0.4466f) { - if(r_nk <= 0.2384f) + if (r_nk <= 0.2384f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -241,9 +239,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mn <= 0.2250f) + if (r_mn <= 0.2250f) { - if(r_mn <= 0.1599f) + if (r_mn <= 0.1599f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -254,11 +252,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.7609f) + if (r_mk <= 0.7609f) { - if(r_mn <= 2.5453f) + if (r_mn <= 2.5453f) { - if(workload <= 1089.6000f) + if (workload <= 1089.6000f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -281,29 +279,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5434.4001f) + if (workload <= 5434.4001f) { - if(workload <= 1603.2000f) + if (workload <= 1603.2000f) { return configure_lhs_rhs_info(m, n, 4, 
4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.6192f) + if (r_nk <= 0.6192f) { - if(r_mn <= 16.1016f) + if (r_mn <= 16.1016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 2750.0000f) + if (workload <= 2750.0000f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_mk <= 6.3151f) + if (r_mk <= 6.3151f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -316,15 +314,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.0387f) + if (r_mk <= 0.0387f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } else { - if(r_mk <= 2.5859f) + if (r_mk <= 2.5859f) { - if(r_mk <= 0.2734f) + if (r_mk <= 0.2734f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -343,13 +341,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 25.7500f) + if (r_mk <= 25.7500f) { - if(r_mk <= 0.3615f) + if (r_mk <= 0.3615f) { - if(r_mn <= 0.0913f) + if (r_mn <= 0.0913f) { - if(r_mk <= 0.0683f) + if (r_mk <= 0.0683f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -365,15 +363,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11174.3999f) + if (workload <= 11174.3999f) { - if(r_mk <= 0.8047f) + if (r_mk <= 0.8047f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 7185.5999f) + if (workload <= 7185.5999f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -385,9 +383,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 17917.5000f) + if (workload <= 17917.5000f) { - if(r_mk <= 1.5078f) + if (r_mk <= 1.5078f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -398,7 +396,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 34449.6016f) + if (workload <= 34449.6016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -412,11 +410,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 331.1111f) + if (r_mk <= 331.1111f) { - if(workload <= 53397.5996f) + if (workload <= 53397.5996f) { - if(r_mn <= 57.8063f) + if (r_mn <= 57.8063f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -427,7 +425,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_nk <= 0.9211f) + if (r_nk <= 0.9211f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -439,7 +437,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 38070.4004f) + if (workload <= 38070.4004f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -453,27 +451,28 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_nk = static_cast<float>(n) / 
static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 801.6000f) + if (workload <= 801.6000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_mn <= 0.1211f) + if (r_mn <= 0.1211f) { - if(workload <= 3296.0000f) + if (workload <= 3296.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 1.0625f) + if (r_nk <= 1.0625f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -485,15 +484,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5068.8000f) + if (workload <= 5068.8000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.2361f) + if (r_nk <= 0.2361f) { - if(workload <= 12630.0000f) + if (workload <= 12630.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } @@ -504,7 +503,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 178790.3984f) + if (workload <= 178790.3984f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -518,12 +517,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h index 466eda00a6..5f62efb59e 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h @@ -45,14 +45,20 @@ public: ClGemmDefaultConfigReshapedValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, 
GEMMRHSMatrixInfo> + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h index 1c32f1358b..83928b3f4f 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp index 9c23d9c998..c4825bfbeb 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp @@ -29,7 +29,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + #include <utility> namespace arm_compute @@ -47,33 +49,39 @@ ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBif { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - 
&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -96,14 +104,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n <= 2548) + if (n <= 2548) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); } @@ -118,12 +127,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -131,7 +141,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); } @@ -142,7 +152,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -154,9 +165,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const bool is_workload_big = ((m * n * b) / 16) >= 2048; - if(m == 1) + if (m == 1) { - if(n >= 8192) + if (n >= 8192) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); @@ -164,7 +175,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 204) + if (n <= 204) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); } @@ -177,25 +188,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); } } // Get lhs_info/rhs_info in case of OpenCL image const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -205,7 +220,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? 
false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -215,7 +230,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_nk = static_cast<float>(n) / static_cast<float>(k); @@ -225,46 +241,49 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(workload <= 274.4000f) + if (workload <= 274.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 
2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -276,14 +295,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 2048) + if (n > 2048) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); @@ -300,7 +320,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; @@ -312,57 +333,59 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); - if(r_mk <= 0.0026f) + if (r_mk <= 0.0026f) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(r_mk <= 0.0148f) + if (r_mk <= 0.0148f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); - if(workload <= 362.6000f) + if (workload <= 362.6000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - if(r_mn <= 22.6067f) + if (r_mn <= 22.6067f) { - if(workload <= 
708.8000f) + if (workload <= 708.8000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -371,27 +394,28 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 0.0917f) + if (r_nk <= 0.0917f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } @@ -400,15 +424,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 7449.60f) + if (workload <= 7449.60f) { - if(workload <= 691.60f) + if (workload <= 691.60f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); } else { - if(workload <= 4155.20f) + if (workload <= 4155.20f) { return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); } @@ -420,21 +444,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16300.80f) + if (workload <= 16300.80f) { - if(r_mn <= 44.56f) + if (r_mn <= 44.56f) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -448,23 +473,25 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; 
GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -476,14 +503,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -497,7 +525,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1)); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); } @@ -508,12 +536,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -524,12 +553,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const 
unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h index 321cbb5250..77c0c8d500 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h @@ -45,21 +45,34 @@ public: ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + 
configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp index d08bf84c72..da3e2ec912 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp @@ -50,30 +50,35 @@ ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyVal { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - 
CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -96,29 +101,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - if(m == 1) + if (m == 1) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(r_mk <= 0.0064484127797186375) + if (r_mk <= 0.0064484127797186375) { - if(r_mn <= 0.0028273810748942196) + if (r_mn <= 0.0028273810748942196) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - const unsigned int h0 = std::max(n / 4, 1U); + const unsigned int h0 = std::max(n / 4, 1U); std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1); std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { @@ -127,7 +132,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.020312500186264515) + if (r_mk <= 0.020312500186264515) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0); } @@ -143,9 +148,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(workload <= 1999.2000122070312) + if (workload <= 1999.2000122070312) { - if(workload <= 747.1999816894531) + if (workload <= 747.1999816894531) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -159,15 +164,14 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mn <= 0.03348214365541935) + if (r_mn <= 0.03348214365541935) { - if(r_mk <= 0.028125000186264515) + if (r_mk <= 0.028125000186264515) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -181,8 +185,7 @@ 
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else @@ -195,168 +198,112 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_fallback = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 }, - { 16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 50176, 64, 
300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0}, {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0}, + {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}, + {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + 
{49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, }; - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 
1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast<float>(m) / static_cast<float>(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_fallback; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -381,17 +328,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -399,7 +346,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1); } @@ -410,30 +357,31 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(m == 1) + if (m == 1) { - if(workload <= 278.7000f) + if (workload <= 278.7000f) { - if(workload <= 7.5000f) + if (workload <= 7.5000f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mn <= 0.0031f) + if (r_mn <= 0.0031f) { - if(workload <= 256.6000f) + if (workload <= 256.6000f) { - if(workload <= 16.7500f) + if (workload <= 16.7500f) { - if(r_nk <= 1.6671f) + if (r_nk <= 1.6671f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -454,15 +402,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.0027f) + if (r_mk <= 0.0027f) { - if(r_mk <= 0.0014f) + if (r_mk <= 0.0014f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } else { - if(workload <= 8.9500f) + if (workload <= 8.9500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } @@ -474,13 +422,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 14.1500f) + if (workload <= 14.1500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mk <= 0.0041f) + if (r_mk <= 0.0041f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -495,9 +443,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 363.7000f) + if (workload <= 363.7000f) { - if(r_mk <= 0.0031f) + if (r_mk <= 0.0031f) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); } @@ -514,9 +462,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(workload <= 704.0000f) + if (workload <= 704.0000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0); } @@ -527,9 +475,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16761.6006f) + if (workload <= 16761.6006f) { - if(r_mn <= 187.1250f) + if (r_mn <= 187.1250f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -540,7 +488,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 432.4630f) + if (r_mk <= 432.4630f) { return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -553,42 +501,37 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / 
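// (Editor's note: illustrative commentary, not part of the patch.) Each of these
// per-target configure functions is effectively a trained decision tree. The
// features are workload = m * n * b / 20 plus the aspect ratios r_mn = m/n,
// r_mk = m/k and r_nk = n/k computed just above, and every leaf returns one
// configure_lhs_rhs_info() call whose arguments encode the block sizes
// (m0, n0, k0) and the interleave/transpose/cl_image flags tuned for that
// region of the problem space.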
static_cast<float>(k); - if(m == 1) + if (m == 1) { - const GeMMConfigsMatrix configs_mnkb_best = - { - { 1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_best = { + {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0}, + {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}}; return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b); } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(r_nk <= 0.8333f) + if (r_nk <= 0.8333f) { - if(r_mk <= 0.9119f) + if (r_mk <= 0.9119f) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1); } else { - if(r_nk <= 0.1181f) + if (r_nk <= 0.1181f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0); } @@ -600,7 +543,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 1.0013f) + if (r_mk <= 1.0013f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1); } @@ -612,11 +555,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 11404.7998f) + if (workload <= 11404.7998f) { - if(r_mk <= 2.2884f) + if (r_mk <= 2.2884f) { - if(r_nk <= 0.9286f) + if (r_nk <= 0.9286f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -632,9 +575,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 1.1926f) + if (r_nk <= 1.1926f) { - if(r_mn <= 1385.7917f) + if (r_mn <= 1385.7917f) { return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -652,12 +595,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } @@ -667,153 +611,101 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 2, 2, 1, 0, 
1, 0, 1, 0, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, + {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, + {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 16544, 104, 160, 1, 4, 4, 8, 
1, 4, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1}, }; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1 }, + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 
92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1}, {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}}; - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast<float>(m) / static_cast<float>(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_best; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -838,17 +730,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); 
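// (Editor's note: illustrative sketch, not part of the patch.) These per-GPU
// heuristics all share one pattern: classify the problem shape (m == 1, small n
// with m >> n, m >> n, n >> m, roughly square, or batched), look up a "best"
// tuning that may export the RHS tensor to a cl_image and a "fallback" tuning
// that only needs a buffer, then keep the best variant only when cl_image is
// actually usable. A minimal sketch of that last step, where
// rhs_cl_image_usable() is a hypothetical predicate standing in for the checks
// performed by select_lhs_rhs_info():
//
//     std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
//     select(const std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> &best,
//            const std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> &fallback,
//            unsigned int n, unsigned int k, unsigned int b, DataType dt)
//     {
//         // Fall back to the buffer-only tuning when the RHS cannot be
//         // exported to a cl_image for this device, shape and data type.
//         return rhs_cl_image_usable(n, k, b, dt) ? best : fallback;
//     }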
std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h index f2952a3d30..a0ea337eb1 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned 
int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h index 1503e74eb6..e07ad993ed 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp index 2407c6ca5e..689a743fdf 100644 --- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp +++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp @@ -36,7 +36,9 @@ namespace opencl { namespace kernels { -Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info) +Status validate_matmul_input_shapes(const TensorShape &lhs_shape, + const TensorShape &rhs_shape, + const MatMulKernelInfo &matmul_kernel_info) { const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); const size_t rhs_k = matmul_kernel_info.adj_rhs ? 
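// (Editor's note: illustrative commentary, not part of the patch.) adj_lhs and
// adj_rhs flag a transposed (adjoint) operand, so the reduction extent K is read
// from the opposite axis: a non-adjoint LHS laid out as (x = K, y = M) gives
// lhs_k = lhs_shape.x(), and the RHS K extent is picked symmetrically here.
// Dimensions from index 2 upwards are batch dimensions, which the loop below
// requires to match exactly because batch broadcasting is unsupported.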
rhs_shape.x() : rhs_shape.y(); @@ -46,7 +48,7 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); constexpr size_t batch_dim_start = 2; - for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); } @@ -54,9 +56,12 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh return Status{}; } -std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, - const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, - int mmul_m0, int mmul_n0) +std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + int mmul_m0, + int mmul_n0) { ARM_COMPUTE_UNUSED(lhs, rhs); diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h index 210f22b109..c2ae2a67f4 100644 --- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h +++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h @@ -44,7 +44,8 @@ namespace kernels * * @return true if the shapes and matmul kernel info match */ -Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, +Status validate_matmul_input_shapes(const TensorShape &lhs_shape, + const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info); /** Validate and configure window for Matmul MMUL kernels * @@ -58,9 +59,12 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh * * @return a pair of Status and Window objects */ -std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, - const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, - int mmul_m0, int mmul_n0); +std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + int mmul_m0, + int mmul_n0); } // namespace kernels } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp index 74a818d738..66877ebcec 100644 --- a/src/gpu/cl/operators/ClActivation.cpp +++ b/src/gpu/cl/operators/ClActivation.cpp @@ -23,19 +23,21 @@ */ #include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClActivationKernel.h" - #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClContext.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" namespace arm_compute { namespace opencl { -void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClActivation::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src, dst, act_info); auto k = std::make_unique<kernels::ClActivationKernel>(); @@ -53,13 +55,17 @@ namespace gpu {
namespace opencl { -std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), + &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ -68,7 +74,7 @@ std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensor act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h index 348dc27929..4f25bb5f24 100644 --- a/src/gpu/cl/operators/ClActivation.h +++ b/src/gpu/cl/operators/ClActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -43,7 +44,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] activation_info Activation layer parameters. 
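 * @note (Editor's illustrative sketch, not part of the original patch.) A
 *       typical guarded call, assuming already-initialised TensorInfo objects
 *       src and dst:
 * @code
 * ClActivation op;
 * ActivationLayerInfo info{ActivationLayerInfo::ActivationFunction::RELU};
 * if (bool(ClActivation::validate(&src, &dst, info)))
 * {
 *     op.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, info);
 * }
 * @endcode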
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &activation_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivation::configure() diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp index b9bf505bba..b58d0df58d 100644 --- a/src/gpu/cl/operators/ClAdd.cpp +++ b/src/gpu/cl/operators/ClAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClAdd.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClAdd::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); @@ -41,8 +44,11 @@ void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClAdd::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h index a17ce7b5d6..7aed902f5d 100644 --- a/src/gpu/cl/operators/ClAdd.h +++ b/src/gpu/cl/operators/ClAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
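 * @note (Editor's note, not part of the original patch.) As the implementation
 *       in ClAdd.cpp above shows, this forwards to
 *       kernels::ClSaturatedArithmeticKernel with ArithmeticOperation::ADD, so
 *       @p policy is passed through unchanged to control overflow handling.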
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp index 05ea21b734..8f26ef003d 100644 --- a/src/gpu/cl/operators/ClCast.cpp +++ b/src/gpu/cl/operators/ClCast.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClCast.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCastKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCast::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_LOG_PARAMS(src, dst, policy); auto k = std::make_unique<kernels::ClCastKernel>(); diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h index 1b67ff7c8e..25d2293673 100644 --- a/src/gpu/cl/operators/ClCast.h +++ b/src/gpu/cl/operators/ClCast.h @@ -58,7 +58,8 @@ public: * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy.
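 * @note (Editor's note, not part of the original patch.) @p policy selects
 *       between ConvertPolicy::WRAP and ConvertPolicy::SATURATE for narrowing
 *       conversions; for example, casting S32 to U8 with SATURATE clamps
 *       values to the [0, 255] range.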
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCast::configure() diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp index a27fc37cc4..31018b9768 100644 --- a/src/gpu/cl/operators/ClConcatenate.cpp +++ b/src/gpu/cl/operators/ClConcatenate.cpp @@ -23,9 +23,14 @@ */ #include "src/gpu/cl/operators/ClConcatenate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h" #include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h" #include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h" @@ -33,42 +38,39 @@ #include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" #include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" - -#include "src/common/utils/Log.h" -#include "src/core/helpers/AutoConfiguration.h" - namespace arm_compute { namespace opencl { -void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis) +void ClConcatenate::configure(const CLCompileContext &compile_context, + const std::vector<ITensorInfo *> &src_vector, + ITensorInfo *dst, + size_t axis) { ARM_COMPUTE_ERROR_ON(dst == nullptr); ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis); _axis = axis; _num_inputs = src_vector.size(); - TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); std::vector<const ITensorInfo *> const_src_vector(src_vector.size()); - std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t; - }); + std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), + [](ITensorInfo *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t; + }); // dst auto initialization if not yet initialized auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type()); ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis)); unsigned int offset = 0; - switch(_axis) + switch (_axis) { case Window::DimX: { - switch(_num_inputs) + switch (_num_inputs) { case 2: { @@ -82,14 +84,15 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std { // Configure WidthConcatenate4Tensors kernel auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>(); - kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), + src_vector.at(3), dst); _concat_kernels.emplace_back(std::move(kernel)); break; } default: { // Configure generic case WidthConcatenate kernels - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int
i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -103,7 +106,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimY: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -114,7 +117,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimZ: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -125,7 +128,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case 3: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -148,25 +151,27 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); unsigned int offset = 0; - switch(axis) + switch (axis) { case Window::DimX: { - switch(num_inputs) + switch (num_inputs) { case 2: // Validate WidthConcatenate2Tensors kernels if there are 2 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); break; case 4: // Validate WidthConcatenate4Tensors kernels if there are 4 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate( + src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); break; default: // Validate generic case of WidthConcatenate kernel - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); @@ -178,7 +183,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case Window::DimY: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -187,7 +192,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case Window::DimZ: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -196,7 +201,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case 3: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { 
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -207,7 +212,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto ARM_COMPUTE_ERROR("Axis not supported"); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -218,17 +223,17 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto void ClConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) + if (static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } - if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) { ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); @@ -236,7 +241,7 @@ void ClConcatenate::run(ITensorPack &tensors) else { int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h index de0cf84d2c..d8ce9d2a5c 100644 --- a/src/gpu/cl/operators/ClConcatenate.h +++ b/src/gpu/cl/operators/ClConcatenate.h @@ -57,7 +57,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector. * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. 
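 * @note (Editor's illustrative sketch, not part of the original patch.) A
 *       width-axis concatenation of two already-initialised tensor infos a and
 *       b into dst:
 * @code
 * ClConcatenate concat;
 * std::vector<ITensorInfo *> srcs{&a, &b};
 * concat.configure(CLKernelLibrary::get().get_compile_context(), srcs, &dst, 0); // axis 0 = Window::DimX
 * @endcode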
*/ - void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis); + void configure(const ClCompileContext &compile_context, + const std::vector<ITensorInfo *> &src_vector, + ITensorInfo *dst, + size_t axis); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConcatenate::configure() @@ -71,8 +74,8 @@ public: private: std::vector<std::unique_ptr<IClKernel>> _concat_kernels{}; - unsigned int _num_inputs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_inputs{0}; + unsigned int _axis{0}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp index eb9475ccaa..2c3b0214fa 100644 --- a/src/gpu/cl/operators/ClConv2d.cpp +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -23,17 +23,17 @@ */ #include "src/gpu/cl/operators/ClConv2d.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "src/gpu/cl/operators/ClIndirectConv2d.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" -#include "src/common/utils/Log.h" - #include <memory> namespace @@ -48,7 +48,7 @@ namespace */ size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) { - switch(gpu_target) + switch (gpu_target) { case arm_compute::GPUTarget::G76: case arm_compute::GPUTarget::G77: @@ -71,27 +71,33 @@ namespace opencl { using namespace arm_compute::misc::shape_calculator; -ClConv2d::ClConv2d() - : _operator() +ClConv2d::ClConv2d() : _operator() { } ClConv2d::~ClConv2d() = default; -void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info) +void ClConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConv2d::validate(src, weights, ((biases != nullptr) ? 
biases : nullptr), dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); auto f = std::make_unique<ClWinogradConv2d>(); - f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, + conv2d_info.enable_fast_math); _operator = std::move(f); break; } @@ -125,35 +131,46 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s _aux_mem = _operator->workspace(); } -Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: { //Validate Winograd - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, + conv2d_info.act_info, conv2d_info.enable_fast_math)); break; } case ConvolutionMethod::DIRECT: { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::INDIRECT: { // Validate indirect convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, 
conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::GEMM: @@ -170,8 +187,12 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target) +ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_ON_NULLPTR(dst); @@ -191,20 +212,35 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>; using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; - const std::vector<ConfigurationMethod> known_configs = - { + const std::vector<ConfigurationMethod> known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), // Mobilenet 160 - 
ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), }; const auto find_config = [&](ConfigurationMethod c) @@ -213,76 +249,89 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const const PadStrideInfo info = std::get<3>(config); const DataLayout data_layout = std::get<4>(config); - return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout()); + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride() && (data_layout == src->data_layout()); }; std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } else { - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { // SRGAN - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && + (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) + if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && + (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) { return ConvolutionMethod::FFT; } - if(src->dimension(idx_c) < 16) + if (src->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } - return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) + ? 
ConvolutionMethod::WINOGRAD + : ConvolutionMethod::GEMM; } else { - const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); - const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); + const bool is_direct_valid = + bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_wino_valid = + bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); // SRGAN case - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && is_direct_valid) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && is_direct_valid) { return ConvolutionMethod::DIRECT; } // Floating-point case: GeMM/Direct/Winograd - if(is_data_type_float(src->data_type())) + if (is_data_type_float(src->data_type())) { // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); - const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; - const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; - const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; - const bool is_ofm_lt_64 = weights->dimension(3U) < 64; - const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; - const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); - const bool is_m_one = output_shape[1] * output_shape[2] == 1; - const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); - const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); + TensorShape output_shape = + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && + (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool is_ofm_lt_64 = weights->dimension(3U) < 64; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + const bool is_m_one = output_shape[1] * output_shape[2] == 1; + const bool is_unit_stride = + (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); + const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); // Run Winograd if valid and IFM >= 8 - if(is_wino_valid && is_ifm_ge_8) + if (is_wino_valid && is_ifm_ge_8) { - if(is_ofm_lte_8) + if (is_ofm_lte_8) { - if(gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) + if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) { return 
ConvolutionMethod::WINOGRAD; } @@ -294,18 +343,19 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const } // Direct convolution case - if(is_direct_valid) + if (is_direct_valid) { - if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) + if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) { - if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) + if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) { return ConvolutionMethod::DIRECT; } } - else if(gpu_target == arm_compute::GPUTarget::G76) + else if (gpu_target == arm_compute::GPUTarget::G76) { - if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) + if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) { return ConvolutionMethod::DIRECT; } @@ -314,21 +364,24 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const { ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT; - const bool is_indirect_valid = bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_indirect_valid = + bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); // indirect conv2d should be called when: // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81) // 2- When the kernel size is odd // 3- When the Gpu target is Arm Mali-G77 - if(is_indirect_valid) + if (is_indirect_valid) { const bool is_kernel_sz_odd = kernel_sz % 2; const bool is_g77 = gpu_target == GPUTarget::G77; - preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT; + preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 + ? ConvolutionMethod::INDIRECT + : ConvolutionMethod::DIRECT; } // Direct/indirect convolution used for the first layer of the network - if(workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) + if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) { // In general, the question we should ask for the first convolution layer of a model is: // when the execution time of im2col + gemm < direct?. Since im2col does not depend on the OFM, it means that @@ -337,13 +390,13 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const return preferred_conv_method; } - if((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) + if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) { return preferred_conv_method; } // Direct convolution used for the last layer of the network - if(is_ofm_lte_8) + if (is_ofm_lte_8) { return preferred_conv_method; } diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h index c6c366a762..0cf3cbc1ce 100644 --- a/src/gpu/cl/operators/ClConv2d.h +++ b/src/gpu/cl/operators/ClConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -112,15 +113,24 @@ public: * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. 
* @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d * * Similar to ClConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will return the convolution called by @ref ClConv2d * @@ -137,11 +147,15 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp index 08122b6852..cf24c68d21 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp @@ -23,16 +23,19 @@ */ #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>(); @@ -40,9 +43,12 @@ void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_c _kernel = std::move(k); } -Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo 
*dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h index 2794eb17b0..c46152081c 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h @@ -43,14 +43,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp index d3b83040d0..e2be7cebd4 100644 --- a/src/gpu/cl/operators/ClCopy.cpp +++ b/src/gpu/cl/operators/ClCopy.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClCopy.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCopyKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window * return kernels::ClCopyKernel::validate(src, dst, dst_window); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h index 9b427f9675..fe9b58c607 100644 --- a/src/gpu/cl/operators/ClCopy.h +++ b/src/gpu/cl/operators/ClCopy.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COPY_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCopy::configure() diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp index cef9f14c7d..6313e4fbb5 100644 --- a/src/gpu/cl/operators/ClCrop.cpp +++ b/src/gpu/cl/operators/ClCrop.cpp @@ -23,17 +23,22 @@ */ #include "src/gpu/cl/operators/ClCrop.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCropKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void ClCrop::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); auto k = std::make_unique<kernels::ClCropKernel>(); @@ -41,9 +46,15 @@ void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInf _kernel = std::move(k); } -Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status ClCrop::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h index 1cf1c9bff4..e845cf372c 100644 --- a/src/gpu/cl/operators/ClCrop.h +++ b/src/gpu/cl/operators/ClCrop.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_CROP_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,16 +50,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCrop::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp index 0fccab63e0..eb6f9e7abb 100644 --- a/src/gpu/cl/operators/ClDequantize.cpp +++ b/src/gpu/cl/operators/ClDequantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClDequantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClDequantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp index 0215dba422..17a196ce6b 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.cpp +++ b/src/gpu/cl/operators/ClDirectConv2d.cpp @@ -26,6 +26,8 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -35,8 +37,6 @@ #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_direct_conv; namespace arm_compute @@ -53,7 +53,8 @@ ITensorPack select_activation_src_dst(ITensorPack &tensors) return pack; } -DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget 
gpu_target = CLScheduler::get().target(); @@ -65,8 +66,13 @@ DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *sr } // namespace -void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClDirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -75,15 +81,17 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); // Configure direct convolution kernel - const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo(); - auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); + const ActivationLayerInfo conv2d_act_info = + (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info + : ActivationLayerInfo(); + auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); k->set_target(CLScheduler::get().target()); k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc); _direct_conv_kernel = std::move(k); // Configure border handler PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { zero_value = PixelValue(0, src->data_type(), src->quantization_info()); } @@ -92,7 +100,7 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI _src_border_handler = std::move(b); // Fused activation is currently supported for NHWC and floating point types - if(act_info.enabled() && !conv2d_act_info.enabled()) + if (act_info.enabled() && !conv2d_act_info.enabled()) { auto a = std::make_unique<kernels::ClActivationKernel>(); a->configure(compile_context, dst, dst, act_info); @@ -103,14 +111,19 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); } -Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); - if(act_info.enabled()) + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); } @@ -124,7 +137,7 @@ void ClDirectConv2d::run(ITensorPack &tensors) // Run direct convolution 
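// Note: the trailing 'false' below is CLScheduler::enqueue_op's flush flag; the queue
// is left unflushed so the optional fused-activation kernel can join the same batch.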
CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); // Run activation kernel - if(_activation_kernel) + if (_activation_kernel) { auto act_pack = select_activation_src_dst(tensors); CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h index fedb9e971e..0f18490814 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.h +++ b/src/gpu/cl/operators/ClDirectConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -59,7 +60,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -67,16 +73,20 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr }; - std::unique_ptr<IClKernel> _src_border_handler{ nullptr }; - std::unique_ptr<IClKernel> _activation_kernel{ nullptr }; + std::unique_ptr<IClKernel> _direct_conv_kernel{nullptr}; + std::unique_ptr<IClKernel> _src_border_handler{nullptr}; + std::unique_ptr<IClKernel> _activation_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp index 5d37f07f31..b08347936b 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.cpp +++ b/src/gpu/cl/operators/ClDirectConv3d.cpp @@ -24,13 +24,19 @@ #include "src/gpu/cl/operators/ClDirectConv3d.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/gpu/cl/kernels/ClDirectConv3dKernel.h" namespace arm_compute { namespace opencl { -void ClDirectConv3d::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info) +void ClDirectConv3d::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0); @@ -40,7 +46,11 @@ void ClDirectConv3d::configure(const CLCompileContext &compile_context, const IT _direct_conv3d_kernel = std::move(k); } -Status ClDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const 
ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info)); return Status{}; diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h index fa58b5aedd..5fb32460e2 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.h +++ b/src/gpu/cl/operators/ClDirectConv3d.h @@ -67,7 +67,12 @@ public: * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration * @@ -75,14 +80,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _direct_conv3d_kernel{ nullptr }; + std::unique_ptr<IClKernel> _direct_conv3d_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp index 32d2b88798..1325371d19 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.cpp +++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp @@ -23,15 +23,18 @@ */ #include "src/gpu/cl/operators/ClElementwiseOperations.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { -void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseDivision::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -39,12 +42,19 @@ void ClElementwiseDivision::configure(const ClCompileContext &compile_context, I _kernel = std::move(k); } -Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseDivision::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info); } -void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMax::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -52,12 +62,19 @@ void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMax::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info); } -void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMin::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -65,12 +82,19 @@ void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMin::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, 
act_info); } -void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -78,12 +102,19 @@ void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context _kernel = std::move(k); } -Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); } -void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwisePower::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -91,7 +122,10 @@ void ClElementwisePower::configure(const ClCompileContext &compile_context, ITen _kernel = std::move(k); } -Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwisePower::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info); } diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h index 120049cb7f..de7c018d75 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.h +++ b/src/gpu/cl/operators/ClElementwiseOperations.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -48,14 +49,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseDivision::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max @@ -74,14 +82,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMax::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min @@ -100,14 +115,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMin::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference @@ -126,14 +148,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseSquaredDiff::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power @@ -152,14 +181,21 @@ public: * @param[out] dst Destination tensor info. Data types supported:F16/F32. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwisePower::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp index f94d402c05..914621183e 100644 --- a/src/gpu/cl/operators/ClElementwiseUnary.cpp +++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp @@ -23,9 +23,8 @@ */ #include "src/gpu/cl/operators/ClElementwiseUnary.h" -#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp index ad22b15cff..817b15ab20 100644 --- a/src/gpu/cl/operators/ClFill.cpp +++ b/src/gpu/cl/operators/ClFill.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClFill.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFillKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) +void ClFill::configure(const ClCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window); auto k = std::make_unique<kernels::ClFillKernel>(); @@ -45,4 +47,4 @@ Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_va return kernels::ClFillKernel::validate(tensor, constant_value, dst_window); } } // 
namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h index 3bbe27ef71..e13862aa6b 100644 --- a/src/gpu/cl/operators/ClFill.h +++ b/src/gpu/cl/operators/ClFill.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClFill::configure() diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp index e277c0d7e4..7532532c94 100644 --- a/src/gpu/cl/operators/ClFlatten.cpp +++ b/src/gpu/cl/operators/ClFlatten.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFlatten.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp index 84f685e381..6790160172 100644 --- a/src/gpu/cl/operators/ClFloor.cpp +++ b/src/gpu/cl/operators/ClFloor.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFloor.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFloorKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp index 5845bbc69e..6969ac8ab3 100644 --- a/src/gpu/cl/operators/ClFullyConnected.cpp +++ b/src/gpu/cl/operators/ClFullyConnected.cpp @@ -24,12 +24,13 @@ #include "src/gpu/cl/operators/ClFullyConnected.h" #include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -38,11 +39,8 @@ #include "src/gpu/cl/operators/ClMatMul.h" #include "src/gpu/cl/operators/ClTranspose.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include <algorithm> @@ -62,8 +60,11 @@ inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src) return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation } -Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst, - 
GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) +Status construct_gemmlowp_output_stage(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo &dst, + GEMMLowpOutputStageInfo &gemmlowp_output_stage, + ActivationLayerInfo activation_info) { gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; gemmlowp_output_stage.gemmlowp_offset = 0; @@ -73,7 +74,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const auto data_type = src.data_type(); // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const QuantizationInfo oq_info = dst.quantization_info(); const UniformQuantizationInfo iq_unif = src.quantization_info().uniform(); @@ -85,15 +86,17 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); PixelValue type_min{}; PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); - if(activation_info.enabled()) + if (activation_info.enabled()) { - std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); + std::tie(type_min, type_max) = + get_quantized_activation_min_max(activation_info, data_type, output_quant_info); } // Set the GEMMLowp output stage info @@ -109,31 +112,41 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info, bool use_matmul) +Status validate_mm(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo *bias, + const ITensorInfo &dst, + const FullyConnectedLayerInfo &fc_info, + bool use_matmul) { // Note : If input is dynamic and data is not batched, use matmul, else use gemm const bool transpose_weights = fc_info.transpose_weights ? 
!fc_info.are_weights_reshaped : false; - const bool use_dynamic_gemm = !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul - const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); + const bool use_dynamic_gemm = + !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul + const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); - if(use_matmul) + if (use_matmul) { const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights); // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1] TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape())); - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); - return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info) : - kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info); + return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, + kernel_info, fc_info.activation_info) + : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, + fc_info.activation_info); } else { GEMMLowpOutputStageInfo gemmlowp_output_stage; - ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -147,7 +160,7 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe true, // broadcast_bias ActivationLayerInfo()); // activation_info - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src.quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); @@ -158,11 +171,9 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate( + &src.clone()->set_quantization_info(src_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info)); } else { @@ -188,11 +199,15 @@ ClFullyConnected::ClFullyConnected() ClFullyConnected::~ClFullyConnected() = default; -void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo 
*weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // If weights are dynamic and matmul is supported use matmul, else use gemm - if(_use_matmul) + if (_use_matmul) { // Specify whether transpose weights is necessary in matmul info const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights); @@ -202,22 +217,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape())); // 2. Use heuristics to get kernel info object - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); // 3. Configure relevant matmul kernel - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>(); _matmul_lowp_native_kernel->set_target(gpu_target); - _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } else { _matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>(); _matmul_native_kernel->set_target(gpu_target); - _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } } else @@ -238,7 +256,7 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe true, // broadcast_bias fc_info.activation_info); // activation_info - if(_is_quantized) + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -248,8 +266,10 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); - weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_info.set_quantization_info( + QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); + weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, + -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); @@ -264,16 +284,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext 
&compile_context, ITe } } -void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. - ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != + (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be linearized // Initialize output tensor for flatten - _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW); + _flattened_src = src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW); // Configure flatten kernel _flatten = std::make_unique<ClFlatten>(); @@ -284,7 +313,11 @@ void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. @@ -294,7 +327,11 @@ void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, configure_mm(compile_context, src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClFullyConnected::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -317,8 +354,9 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched. // 3. 
When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required) const bool is_batched_fc_layer = dst->dimension(1) > 1; - _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; + _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -327,11 +365,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 4) Fully Connected layer -> Fully Connected layer with batches // Check if we have a fully connected layer with batches - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -341,7 +379,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso ITensorInfo *weights_used = weights; // Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op. 
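// --- Editor's sketch (not part of the patch): the _use_matmul / _dynamic_gemm flags set
// above reduce to three mutually exclusive execution paths. A minimal standalone sketch
// of that decision, using hypothetical names (FcPath, select_fc_path) for illustration:
enum class FcPath { MatMul, DynamicGemm, StaticGemm };
inline FcPath select_fc_path(bool use_matmul, bool dynamic_gemm)
{
    if (use_matmul)   return FcPath::MatMul;      // non-constant weights, unbatched, non-Midgard target
    if (dynamic_gemm) return FcPath::DynamicGemm; // non-constant weights needing transpose, matmul ruled out
    return FcPath::StaticGemm;                    // constant weights: regular (lowp) GEMM path
}
// --- End of editor's sketch. ---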
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Reshape the weights _reshape_weights = std::make_unique<ClTranspose>(); @@ -351,14 +389,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>(); - _convert_weights->configure(compile_context, - weights_used, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); weights_used = &_converted_weights; @@ -366,7 +401,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso _run_convert_weights = true; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info); @@ -379,60 +414,69 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion) _weights_to_use = *weights_used; - if(_use_matmul) + if (_use_matmul) { // Note : MatMul does not use transpose and does not need auxiliary memory, so only converted weights are added to aux_mem - _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = + MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); } else { // Set auxiliary memory requirements for gemm operators auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs + if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch - const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ?
MemoryLifetime::Persistent : MemoryLifetime::Prepare; - const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare; - - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, - _converted_weights.total_size()); + const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, + _converted_weights.total_size()); } } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status ClFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target()); const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; @@ -441,11 +485,20 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei // When using dynamic weights - use matmul kernels. // Note: MatMul does not support broadcasting, so fall back to GEMM for batched cases.
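// --- Editor's sketch (not part of the patch): "batched" here means dst carries a second
// dimension, e.g. a conv output [W, H, C, B] flattened to [W*H*C, B] against weights
// [W*H*C, N] producing dst [N, B]; MatMul cannot broadcast that batch, hence the GEMM
// fallback. Below is a standalone version of the trailing-dimension test used in this
// function, with a hypothetical helper name and plain vectors in place of TensorShape
// (assumes dst_shape has at least src_shape.size() - 2 entries, mirroring the std::equal call):
#include <algorithm>
#include <cstddef>
#include <vector>
inline bool fc_follows_conv(const std::vector<std::size_t> &src_shape, const std::vector<std::size_t> &dst_shape)
{
    // src dims from index 3 onwards must match dst dims from index 1 onwards
    return src_shape.size() >= 4 && std::equal(src_shape.cbegin() + 3, src_shape.cend(), dst_shape.cbegin() + 1);
}
// --- End of editor's sketch. ---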
const bool is_batched_fc_layer = dst->dimension(1) > 1; - const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) ? TensorInfo(*reshaped_weights.clone()) : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); + const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && + !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + + const ITensorInfo &flatten_src = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) + ? TensorInfo(*reshaped_weights.clone()) + : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -456,10 +509,10 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei const ITensorInfo *src_to_use = src; const ITensorInfo *weights_to_use = weights; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -470,11 +523,11 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Check if FC is after conv (flatten kernel is run in case where FC is after conv.) - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -482,29 +535,28 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Transpose kernel does not run when matmul is supported as matmul fuses transpose op. 
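// --- Editor's sketch (not part of the patch): with adj_rhs the matmul kernel reads the
// weights as transposed in-kernel, so the K dimension of the weight tensor is found at
// index 0 rather than index 1. This mirrors the weight_idx expression used further down:
inline int weights_k_index(bool use_matmul, bool transpose_weights)
{
    return (use_matmul && transpose_weights) ? 0 : 1;
}
// --- End of editor's sketch. ---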
- if(transpose_weights && !use_matmul) + if (transpose_weights && !use_matmul) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1; - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src)); @@ -539,24 +591,24 @@ void ClFullyConnected::run(ITensorPack &tensors) CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false); // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src); - if(_weights_to_use_idx != ACL_SRC_1) + if (_weights_to_use_idx != ACL_SRC_1) { gemm_pack.add_const_tensor(ACL_SRC_1, weights.get()); } // Run MatMul Op - if(_use_matmul) + if (_use_matmul) { // Run matmul kernels for matrix multiplication - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true); } @@ -568,7 +620,7 @@ void ClFullyConnected::run(ITensorPack &tensors) else { // Run matrix multiply - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp->run(gemm_pack); } @@ -582,7 +634,7 @@ void ClFullyConnected::run(ITensorPack &tensors) void ClFullyConnected::prepare(ITensorPack &tensors) { // Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed. - if(!_is_prepared || _dynamic_gemm) + if (!_is_prepared || _dynamic_gemm) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -598,10 +650,10 @@ void ClFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose. 
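// --- Editor's sketch (not part of the patch): prepare() below re-executes on every run
// only for the dynamic-GEMM fallback, because non-constant weights must be re-transposed
// and re-converted each time; with constant weights it runs once and is then skipped.
// A minimal version of that gate, with a hypothetical name:
inline bool needs_prepare(bool is_prepared, bool dynamic_gemm)
{
    return !is_prepared || dynamic_gemm; // matches the if (!_is_prepared || _dynamic_gemm) gate
}
// --- End of editor's sketch. ---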
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; _reshape_weights->run(transpose_pack); cur_weights->mark_as_unused(); @@ -609,9 +661,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) } // Convert weights if needed - if(_run_convert_weights) + if (_run_convert_weights) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -622,9 +674,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Run GEMM prepare and release unused weights - if(_dynamic_gemm || !_use_matmul) + if (_dynamic_gemm || !_use_matmul) { - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h index d975859d87..0621238ab5 100644 --- a/src/gpu/cl/operators/ClFullyConnected.h +++ b/src/gpu/cl/operators/ClFullyConnected.h @@ -47,7 +47,7 @@ namespace kernels { class ClMatMulNativeKernel; class ClMatMulLowpNativeKernel; -} +} // namespace kernels /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels: * * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer) @@ -88,7 +88,11 @@ public: * Data type supported: Same as @p src. * @param[in] fc_info (Optional) Fully connected layer additional info */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -96,18 +100,36 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); // Inherited methods overridden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo
*weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); private: enum AuxTensorIdx @@ -134,19 +156,19 @@ private: TensorInfo _reshaped_weights{}; TensorInfo _lhs_to_use{}; TensorInfo _weights_to_use{}; - int _weights_to_use_idx{ ACL_SRC_1 }; + int _weights_to_use_idx{ACL_SRC_1}; - bool _run_convert_weights{ false }; - bool _transpose_weights{ false }; - bool _dynamic_gemm{ false }; - bool _use_matmul{ false }; + bool _run_convert_weights{false}; + bool _transpose_weights{false}; + bool _dynamic_gemm{false}; + bool _use_matmul{false}; - bool _is_fc_after_conv{ true }; - bool _is_quantized{ false }; - bool _is_prepared{ false }; + bool _is_fc_after_conv{true}; + bool _is_quantized{false}; + bool _is_prepared{false}; #ifdef ARM_COMPUTE_ASSERTS_ENABLED - int _asrt_run_count {}; + int _asrt_run_count{}; int _asrt_prepare_count{}; #endif // ARM_COMPUTE_ASSERTS_ENABLED }; diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp index 7e331a86f3..815c254c69 100644 --- a/src/gpu/cl/operators/ClGemm.cpp +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -33,11 +33,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -45,8 +46,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include "utils/TypePrinter.h" @@ -67,35 +66,43 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) return kernel_type == CLGEMMKernelType::NATIVE ? 
false : true; } //Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type -inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) +inline CLGEMMKernelType +auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) { - if(!constant_weights) + if (!constant_weights) { return CLGEMMKernelType::NATIVE; } auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -103,12 +110,14 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.has_pad_y = false; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } gemm_kernel_info.has_pad_y = true; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -116,49 +125,65 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } //Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, - const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output) +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped kernel -inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d) +inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info, + bool reinterpret_input_as_3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; // Validate reshape LHS kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); - if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) + auto_init_if_empty(tmp_a_info, + a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); + if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) { return false; } // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } // Validate mm kernel gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; - 
if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -166,21 +191,32 @@ inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, co } //Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs -inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d) +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + bool reinterpret_input_as_3d) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d)) + if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, + reinterpret_input_as_3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), + to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } // namespace @@ -200,18 +236,24 @@ ClGemm::ClGemm() { } -void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -225,24 +267,32 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn // Set the target for the kernels _mm_native_kernel->set_target(gpu_target); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Configure and tune matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info); + _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, + kernel_info); } -void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -261,32 +311,42 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, - c, output, gemm_info.reinterpret_input_as_3d()); + std::tie(lhs_info, rhs_info) = + auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, + kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d()); _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); // Configure and tune matrix multiply kernel - _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for LHS and RHS reshape matrix _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -304,7 +364,8 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output); // Transpose matrix _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); @@ -315,24 +376,33 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -350,9 +420,10 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -361,13 +432,22 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, + rhs_info, kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -376,12 +456,12 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -392,15 +472,23 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate( + a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -412,12 +500,12 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -433,23 +521,33 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = + select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape( + compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, + beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -460,12 +558,12 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -481,24 +579,33 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); kernel_info.has_pad_y = true; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -508,12 +615,12 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -529,9 +636,10 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -540,12 +648,20 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +void ClGemm::configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -558,20 +674,21 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, _is_prepared = gemm_info.retain_internal_weights(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Select GEMMType - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run, - b->are_values_constant()); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size}, + _reshape_b_only_on_first_run, b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -600,35 +717,41 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, } } -Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Get the GPU target bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Check data type early because the auto_select_gemm_kernel has assertions on supported data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16); // Select GEMMType - CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery - { - CLScheduler::get().target(), - a->data_type(), - m, - n, - k, - batch_size, - }, - gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); + CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{ + CLScheduler::get().target(), + a->data_type(), + m, + n, + k, + batch_size, + }, + gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); const ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - switch(gemm_kernel_type) + switch (gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -647,7 +770,8 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); break; } default: @@ -674,7 +798,7 @@ void ClGemm::run(ITensorPack &tensors) prepare(tensors); // Run matrix multiply kernel - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -684,13 +808,13 @@ void ClGemm::run(ITensorPack &tensors) case CLGEMMKernelType::RESHAPED: { // Run interleave kernel - ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } }; + ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts @@ -698,7 +822,7 @@ void ClGemm::run(ITensorPack &tensors) gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get()); gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED) { CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true); } @@ -706,10 +830,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -722,7 +846,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -734,10 +858,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -750,7 +874,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -769,20 +893,22 @@ void ClGemm::run(ITensorPack &tensors) void ClGemm::prepare(ITensorPack &constants) { - 
if(!_is_prepared) + if (!_is_prepared) { - const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); - ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape))); + const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); + ICLTensor *rhs_aux = + utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape))); // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed - if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) + if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && + (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) { ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!"); CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux); ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr); - ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true); } _is_prepared = true; diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h index 11f9f2b3d8..85dc1d6c8f 100644 --- a/src/gpu/cl/operators/ClGemm.h +++ b/src/gpu/cl/operators/ClGemm.h @@ -90,30 +90,95 @@ public: * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping * in case matrix A and matrix B have been already transformed. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: - void configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, 
const GEMMInfo &gemm_info); + void configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); - static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); private: enum AuxTensorIdx diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp index 5620471ff9..55d815a1ef 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.cpp +++ b/src/gpu/cl/operators/ClGemmConv2d.cpp @@ -28,10 +28,12 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -41,8 +43,6 @@ #include "src/gpu/cl/operators/ClGemm.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -53,18 +53,38 @@ using namespace utils::cast; namespace opencl { ClGemmConv2d::ClGemmConv2d() - : 
_weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(), - _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape_kernel(nullptr), + _im2col_kernel(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _col2im_kernel(nullptr), + _activation_kernel(nullptr), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _fuse_activation(true), + _append_bias(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) { } ClGemmConv2d::~ClGemmConv2d() = default; -void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info) + int gemm_3d_depth, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -77,18 +97,20 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I false, // fp_mixed_precision true, // broadcast_bias act_info // activation_info - ); + ); - TensorInfo tmp_src{ *src }; - if(_is_quantized) + TensorInfo tmp_src{*src}; + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo input_quantization_info = src->quantization_info(); const QuantizationInfo weights_quantization_info = weights->quantization_info(); - tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + tmp_src.set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info); @@ -97,7 +119,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I weights->set_quantization_info(weights_quantization_info); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } @@ -108,15 +130,21 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I _mm_gemm = std::make_unique<ClGemm>(); 
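
The sign flip above is the crux of the quantized path: with the usual asymmetric scheme real = scale * (q - offset), configure_mm() stashes -offset in the temporary QuantizationInfo so the lowp kernels can apply the offset as an addition, and the original weights info is restored right after configuration. A minimal numeric sketch of that identity, using simplified stand-ins rather than the ACL types:

#include <cassert>

struct UniformQuantization
{
    float scale;
    int   offset;
};

// With real = scale * (q - offset), storing -offset lets a kernel apply
// the offset as an addition while producing the same real value.
float dequantize(int q, const UniformQuantization &qi)
{
    return qi.scale * static_cast<float>(q - qi.offset);
}

int main()
{
    const UniformQuantization src{0.05f, 10};
    const UniformQuantization negated{src.scale, -src.offset}; // as in configure_mm()

    // scale * (q - 10) == scale * (q + (-10))
    assert(dequantize(42, src) == src.scale * (42 + negated.offset));
    return 0;
}
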
_mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info) +Status ClGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info) { const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type()); @@ -131,9 +159,9 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig false, // fp_mixed_precision true, // broadcast_bias act_info // activation_info - ); + ); - if(is_quantized) + if (is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -142,8 +170,10 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig std::unique_ptr<ITensorInfo> src_qa = src->clone(); std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_qa->set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights_qa->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Perform validation step on GEMMLowp return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info); @@ -155,14 +185,17 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig } } -void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) +void ClGemmConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, - conv2d_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); const DataType data_type = src->data_type(); @@ -180,7 +213,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && 
conv2d_info.conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); _skip_col2im = data_layout == DataLayout::NHWC; // Only for quantize there are few cases where we cannot fuse the activation function in GEMM @@ -197,12 +231,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // Get convolved dimensions unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; @@ -210,28 +240,31 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _append_bias = false; _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>(); - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor biases_to_use = nullptr; _append_bias = true; - _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, + conv2d_info.num_groups); } else { - _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, + conv2d_info.num_groups); } // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { // Configure and tune im2col. im2col output shape is auto-initialized _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>(); // Set the GPU target for im2col _im2col_kernel->set_target(CLScheduler::get().target()); - _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); + _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), + conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); // Set quantization info _im2col_output.set_quantization_info(src->quantization_info()); @@ -242,7 +275,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf } // Create GEMM output tensor - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -263,7 +296,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf gemmlowp_output_stage.gemmlowp_offset = 0; // Configure output stage for quantized case - if(_is_quantized) + if (_is_quantized) { const auto output_quant_info = (dst->total_size() == 0) ? 
iq_info : oq_info; const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); @@ -286,16 +319,16 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf auto min_activation = min_val.get<int32_t>(); auto max_activation = max_val.get<int32_t>(); - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -313,48 +346,60 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); + configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); - if(!_skip_col2im) + if (!_skip_col2im) { // Set the GPU target for col2im _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>(); _col2im_kernel->set_target(CLScheduler::get().target()); // Configure and tune Col2Im - _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups); + _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), + conv2d_info.num_groups); CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); } ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - if(!_fuse_activation) + if (!_fuse_activation) { _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>(); _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info); } - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = + MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, 
_gemm_output.total_size()); } -Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); - if(!is_quantized_per_channel) + if (!is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), + "Grouping (num_groups != 1) is not supported with QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && + (src->data_layout() == DataLayout::NCHW)); const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -374,18 +419,19 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights const ITensorInfo *gemm_output_to_use = dst; const ITensorInfo *weights_to_use = weights; const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 - && conv2d_info.conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); + const bool skip_col2im = data_layout == DataLayout::NHWC; + bool fuse_activation = true; - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != + src->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -397,7 +443,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); } - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a()); } @@ -406,48 +452,50 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; const ITensorInfo *biases_to_use = biases; bool append_bias = false; - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); + biases_to_use = nullptr; + append_bias = true; + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); } else { - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); } weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const Size2D kernel_dims(kernel_width, kernel_height); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, + conv2d_info.num_groups == 1, conv2d_info.num_groups); auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, + append_bias, conv2d_info.dilation, conv2d_info.num_groups)); gemm_input_to_use = &im2col_reshaped_info; } // Create GEMM output tensor - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm; @@ -465,7 +513,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights gemmlowp_output_stage.gemmlowp_offset = 0; gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = 
dst->quantization_info().uniform(); @@ -483,16 +531,16 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights int min_activation = 0; int max_activation = 0; - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -509,16 +557,18 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); // Validate Col2Im - if(!skip_col2im) + if (!skip_col2im) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); } // Validate Activation Layer - if(!fuse_activation) + if (!fuse_activation) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info)); } @@ -541,30 +591,26 @@ void ClGemmConv2d::run(ITensorPack &tensors) CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); // Run im2col - if(!_skip_im2col) + if (!_skip_im2col) { - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false); gemm_input_to_use = im2col_output.get(); } - if(!_skip_col2im) + if (!_skip_col2im) { gemm_output_to_use = gemm_output.get(); } ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); - if(!_append_bias) + if (!_append_bias) { pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases); } pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions - if(_is_quantized) + if (_is_quantized) { // Run gemmlowp _mm_gemmlowp->run(pack_mm); @@ -576,43 +622,32 @@ void ClGemmConv2d::run(ITensorPack &tensors) } // Reshape output matrix 
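
Everything in run() above moves through ITensorPack rather than positional arguments, which is what allows the pack to be copied and its ACL_SRC_1 slot overwritten with the reshaped weights. A compact sketch of that id-to-tensor pack pattern, with illustrative stand-ins (TensorPack, Tensor, and the enum below are not the ACL types):

#include <cassert>
#include <unordered_map>

enum TensorId { ACL_SRC_0, ACL_SRC_1, ACL_SRC_2, ACL_DST };

struct Tensor { /* buffer handle elided */ };

class TensorPack
{
public:
    void add_tensor(TensorId id, Tensor *t) { _map[id] = t; }
    Tensor *get_tensor(TensorId id) const
    {
        const auto it = _map.find(id);
        return it == _map.end() ? nullptr : it->second;
    }

private:
    std::unordered_map<int, Tensor *> _map;
};

int main()
{
    Tensor lhs, rhs_reshaped, dst;

    TensorPack pack;
    pack.add_tensor(ACL_SRC_0, &lhs);
    pack.add_tensor(ACL_SRC_1, &rhs_reshaped); // reshaped weights replace the raw RHS
    pack.add_tensor(ACL_DST, &dst);

    assert(pack.get_tensor(ACL_SRC_1) == &rhs_reshaped);
    return 0;
}
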
- if(!_skip_col2im) + if (!_skip_col2im) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false); } //Run Activation Layer if we cannot fuse in GEMM - if(!_fuse_activation) + if (!_fuse_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false); } } void ClGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // Run weights reshaping and mark original weights tensor as unused - ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); + ICLTensor *weights_reshaped_p = + utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; - if(_append_bias) + if (_append_bias) { const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); pack.add_const_tensor(TensorType::ACL_BIAS, biases); diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h index 8a46ee2dc3..e8f3147ac3 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.h +++ b/src/gpu/cl/operators/ClGemmConv2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -100,15 +101,24 @@ public: * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: @@ -130,9 +140,14 @@ private: * @param[in] gemm_3d_depth Depth of GEMM 3D * @param[in] act_info Activation to apply after the matrix multiplication */ - void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure_mm(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info); + int gemm_3d_depth, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
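
Each configure() declared above has a static validate() twin taking the same tensor arguments, so callers can reject a configuration before any kernel objects or auxiliary memory exist; configure() itself re-runs the check via ARM_COMPUTE_ERROR_THROW_ON. A minimal sketch of that two-phase idiom, with simplified stand-ins for Status and the operator:

#include <stdexcept>

struct Status
{
    bool ok;
    explicit operator bool() const { return ok; }
};

class Operator
{
public:
    // Query-only check: no kernels or auxiliary memory are touched.
    static Status validate(int m, int n, int k)
    {
        return Status{m > 0 && n > 0 && k > 0};
    }

    void configure(int m, int n, int k)
    {
        if (!validate(m, n, k)) // mirrors ARM_COMPUTE_ERROR_THROW_ON(validate(...))
        {
            throw std::invalid_argument("invalid GEMM shape");
        }
        _m = m; _n = n; _k = k; // the real operator builds its kernels here
    }

private:
    int _m = 0, _n = 0, _k = 0;
};

int main()
{
    if (Operator::validate(32, 64, 128)) // cheap up-front query
    {
        Operator op;
        op.configure(32, 64, 128);
    }
    return 0;
}
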
@@ -148,8 +163,14 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info); enum AuxTensorIdx { diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp index 2622274587..71c247de79 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp @@ -52,7 +52,7 @@ namespace { inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: case CLGEMMKernelType::RESHAPED_ONLY_RHS: @@ -71,32 +71,41 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run) { auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for native kernel -inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo mm_result_s32_info{}; // Output tensor auto initialization if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty( + mm_result_s32_info, + a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); // Validate mm kernel // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info // NOTE: This assumes: // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). // 2. 
lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). - if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) + if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, + reshape_info))) { return false; } @@ -104,31 +113,45 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons } // Automatically select between mlgo (prioritized) and default heuristics for native kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { auto config = auto_heuristics::select_mlgo_gemm_config_native(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) + if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_native(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -148,7 +171,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs // Since we ignore the 
output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -156,14 +180,22 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -183,7 +215,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -191,40 +224,55 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), 
to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d); auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), - to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: return false; @@ -254,8 +302,11 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore() ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default; void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, - ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, - const GEMMInfo &gemm_info) + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info)); @@ -263,8 +314,8 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _a_offset = a->quantization_info().uniform().offset; 
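
auto_select_gemm_config_reshaped_only_rhs() above is one of several selectors in this file that share the same shape: ask the MLGO heuristic first, keep its answer only if it validates against the actual tensors, and otherwise fall back to the default heuristic — which is why the NOTE earlier says MLGO configurations need no separate validation. A minimal sketch of that selection shape, with illustrative callables standing in for the heuristics:

#include <functional>
#include <optional>

struct GemmConfig
{
    int h0;
};

// Try MLGO first; keep its config only if it validates, else fall back.
GemmConfig select_config(const std::function<std::optional<GemmConfig>()> &mlgo,
                         const std::function<GemmConfig()>                &fallback,
                         const std::function<bool(const GemmConfig &)>    &valid)
{
    if (const auto cfg = mlgo())
    {
        if (valid(*cfg))
        {
            return *cfg; // "Use ... config from mlgo heuristics"
        }
    }
    return fallback(); // "Use ... config from default heuristics"
}

int main()
{
    const GemmConfig cfg = select_config(
        [] { return std::optional<GemmConfig>{GemmConfig{8}}; }, // MLGO suggestion
        [] { return GemmConfig{4}; },                            // built-in default
        [](const GemmConfig &c) { return c.h0 <= 4; });          // suggestion rejected here

    return cfg.h0 == 4 ? 0 : 1;
}
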
- _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && a->data_type() == DataType::QASYMM8; + _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8; _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset; _gemm_info = gemm_info; @@ -282,17 +333,18 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con // Arguments used by GEMMReshapeInfo // in order to know how the matrices have been reshaped bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { // Set data type for converted weights _qasymm8_weights = *b; @@ -301,47 +353,50 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, &_tmp_b, + rhs_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, + rhs_info); } // Using default reduction info - const GEMMLowpReductionKernelInfo reduction_info {}; + const GEMMLowpReductionKernelInfo reduction_info{}; // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, + &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); @@ -360,17 +415,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.a_offset = _a_offset; gemm_kernel_info.b_offset = _b_offset; // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { // Configure offset contribution kernel - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32); _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); - if(num_filters == 1) + if (num_filters == 1) { // Per-channel quantization with OFM == 1 is equivalent to uniform quantization. 
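    // [Review note: not part of the change] For context on the surrounding configure logic,
    // the reduction kernels and the offset-contribution stage set up in this function
    // implement the standard gemmlowp decomposition. With zero points a_off and b_off and
    // K the inner dimension:
    //
    //   dst(i,j) = sum_k (A(i,k) - a_off) * (B(k,j) - b_off)
    //            =   sum_k A(i,k) * B(k,j)      // raw int32 GEMM result
    //              - a_off * sum_k B(k,j)       // column sums of B -> _vector_sum_col
    //              - b_off * sum_k A(i,k)       // row sums of A    -> _vector_sum_row
    //              + K * a_off * b_off          // scalar constant per GEMM
    //
    // This is why the matrix-B reduction is configured only when _a_offset != 0 and the
    // matrix-A reduction only when _b_offset != 0, as the guards below show.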
// Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts @@ -379,55 +436,67 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_mmul_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } else { _run_output_stage = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? 
&_qasymm8_weights : matrix_b, reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); - - _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, - &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, + reshape_info); + + _offset_contribution_output_stage_kernel->configure( + compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0), + _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, + &_gemm_output_stage_shifts); } } } else { _run_offset_contribution = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); @@ -436,44 +505,65 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); // Configure matrix multiply kernel _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info); } // Configure offset contribution kernel - _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset); + _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + a->dimension(0), _a_offset, _b_offset); } // Request memory - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - if(is_gemm_reshaped(_gemm_kernel_type)) + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + if (is_gemm_reshaped(_gemm_kernel_type)) { // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - } - if(_a_offset != 0) - { - _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size()); - } - if(_b_offset != 0) - { - _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - } - _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size()); - _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + } + if (_a_offset != 0) + { + _aux_mem[VecSumCol] = + MemoryInfo(offset_int_vec(VecSumCol), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + } + if (_b_offset != 0) + { + _aux_mem[VecSumRow] = + MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + } + _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, + _gemm_output_stage_multipliers.total_size()); + _aux_mem[Shifts] = + MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); } -Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); @@ -492,39 +582,44 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GPUTarget gpu_target = CLScheduler::get().target(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run())); + bool reshape_matrix_b = is_gemm_reshaped( + auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, + gemm_info.reshape_b_only_on_first_run())); const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && is_data_type_quantized_asymmetric(a->data_type()); + bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && + is_data_type_quantized_asymmetric(a->data_type()); TensorInfo weights_info(*b); - if(convert_to_qasymm8) + if (convert_to_qasymm8) { b_offset = -128; weights_info.set_data_type(DataType::QASYMM8); ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); } const ITensorInfo *matrix_b_info = &weights_info; - if(reshape_matrix_b) + if (reshape_matrix_b) { matrix_b_info = &tmp_b_info; // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); + auto_init_if_empty(tmp_b_info, + weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); } @@ -533,21 +628,23 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GEMMLowpReductionKernelInfo reduction_info; // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, 
reduction_info)); } GEMMKernelInfo gemm_kernel_info; @@ -560,92 +657,99 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.a_offset = a_offset; gemm_kernel_info.b_offset = b_offset; - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; - const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + const TensorInfo gemm_output_stage_multipliers_shifts_info( + TensorInfo(TensorShape(num_filters), 1, DataType::S32)); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (reshape_matrix_b && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info, + &gemm_output_stage_multipliers_shifts_info)); } else { TensorInfo mm_result_s32_info{}; - if(reshape_matrix_b) + if (reshape_matrix_b) { // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, reshape_info)) + .set_data_type(DataType::S32)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); } else { // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, false, reshape_info)) + .set_data_type(DataType::S32)); // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - output, - a_offset, b_offset, - gemmlowp_output_stage, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage, + &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info)); } } else { - if(reshape_matrix_b) + if (reshape_matrix_b) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info)); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + c, a_offset, b_offset)); } } @@ -675,73 +779,61 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) const ITensor *matrix_a = a; const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b; - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { matrix_b = tmp_b.get(); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run reshape matrix B - ITensorPack mtx_b_reshape_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? 
rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack mtx_a_red_pack = - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_DST, vec_sum_row.get() } - }; + ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}}; CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); } // Run matrix multiply - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { ITensorPack gemm_reshaped_pack; - if(_run_offset_contribution) + if (_run_offset_contribution) { - gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst } - }); + gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}}); } else { - gemm_reshaped_pack = ITensorPack( - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + gemm_reshaped_pack = ITensorPack({ + {TensorType::ACL_SRC, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false); } @@ -752,46 +844,39 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - ITensorPack gemm_native_pack = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() } - }; + ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}}; CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); } - if(_run_output_stage) + if (_run_output_stage) { // Run offset contribution/output stage kernel - ITensorPack output_stage_pack = - { - { TensorType::ACL_SRC, res32.get() }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? 
nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + ITensorPack output_stage_pack = { + {TensorType::ACL_SRC, res32.get()}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); } - if(_run_offset_contribution) + if (_run_offset_contribution) { // Run offset contribution kernel - ITensorPack offset_contrib_pack = - { - { TensorType::ACL_SRC_DST, dst }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() } - }; + ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); } } void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); @@ -800,56 +885,55 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) ARM_COMPUTE_ERROR_ON_NULLPTR(b); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { - ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } }; + ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}}; CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false); b->mark_as_unused(); } - if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) + if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) { // Run reshape kernel and mark original weights tensor as unused - ITensorPack mtx_b_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); b->mark_as_unused(); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && _reshape_b_only_on_first_run) + if (_a_offset != 0 && _reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Compute GEMM output multipliers and shifts for output stage { - const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? 
_gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false); CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false); ICLTensor *multiplier_tensor = multipliers.get(); - if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) + if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) { multiplier_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); + std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), + num_filters * sizeof(int32_t)); multiplier_tensor->unmap(CLScheduler::get().queue()); } ICLTensor *shifts_tensor = shifts.get(); - if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) + if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) { shifts_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); + std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); shifts_tensor->unmap(CLScheduler::get().queue()); } } diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h index 6e32a90fc4..c80dc3a182 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -93,18 +93,27 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -130,7 +139,7 @@ private: std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel; - std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; + 
std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; // Temporary tensors TensorInfo _qasymm8_weights{}; @@ -141,13 +150,13 @@ private: TensorInfo _gemm_output_stage_multipliers{}; TensorInfo _gemm_output_stage_shifts{}; - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - bool _reshape_b_only_on_first_run{ false }; - bool _run_output_stage{ false }; - bool _convert_to_qasymm8{ false }; - bool _run_offset_contribution{ false }; - bool _is_prepared{ false }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + bool _reshape_b_only_on_first_run{false}; + bool _run_output_stage{false}; + bool _convert_to_qasymm8{false}; + bool _run_offset_contribution{false}; + bool _is_prepared{false}; GEMMInfo _gemm_info{}; CLGEMMKernelType _gemm_kernel_type{}; diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp index a61b11a3b1..e3363e3685 100644 --- a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp @@ -27,22 +27,25 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { @@ -70,12 +73,16 @@ void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, c } } -Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info); @@ -94,7 +101,7 @@ void ClGemmLowpOutputStage::run(ITensorPack &tensors) const ITensor *bias = tensors.get_const_tensor(ACL_BIAS); ITensor *dst = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_kernel, pack, true); } } // namespace opencl diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h index 3f1b04dcce..6357e0200b 100644 --- 
a/src/gpu/cl/operators/ClGemmLowpOutputStage.h +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h @@ -71,14 +71,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info GEMMLowp output stage metadata. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpOutputStage::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp index b900974574..777fc9e5e1 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.cpp +++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp @@ -27,16 +27,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h" #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h" #include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_indirect_conv; namespace arm_compute @@ -47,7 +46,8 @@ using namespace arm_compute::experimental; namespace { -DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); @@ -59,8 +59,13 @@ DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo * } // namespace -void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClIndirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -86,25 +91,29 @@ void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITenso CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel); // Request memory for the indirect buffer - _aux_mem[IndirectBuffer] = MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, 
_indirect_buffer.total_size()); + _aux_mem[IndirectBuffer] = + MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size()); } -Status ClIndirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClIndirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info); - TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(src, weights, &indirect_buffer, conv_info, desc)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate( + src, weights, &indirect_buffer, conv_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, + conv_info, act_info, desc)); return Status{}; } @@ -124,9 +133,10 @@ void ClIndirectConv2d::run(ITensorPack &tensors) void ClIndirectConv2d::prepare(ITensorPack &constants) { - if(!_is_prepared) + if (!_is_prepared) { - ICLTensor *indirect_buffer_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); + ICLTensor *indirect_buffer_aux = + utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer"); @@ -134,7 +144,7 @@ void ClIndirectConv2d::prepare(ITensorPack &constants) CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux); ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr); - ITensorPack indirect_buffer_pack{ { ACL_DST, indirect_buffer.get() } }; + ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}}; CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true); _is_prepared = true; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h index e50fa25069..29e796efd9 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.h +++ b/src/gpu/cl/operators/ClIndirectConv2d.h @@ -77,7 +77,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -85,12 +90,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -100,11 +109,11 @@ private: Count }; - std::unique_ptr<IClKernel> _indirect_conv_kernel{ nullptr }; - std::unique_ptr<IClKernel> _addr_precalculation_kernel{ nullptr }; + std::unique_ptr<IClKernel> _indirect_conv_kernel{nullptr}; + std::unique_ptr<IClKernel> _addr_precalculation_kernel{nullptr}; TensorInfo _indirect_buffer{}; - bool _is_prepared{ false }; - experimental::MemoryRequirements _aux_mem{ Count }; + bool _is_prepared{false}; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp index b2eb89b320..d8d4186d00 100644 --- a/src/gpu/cl/operators/ClLogicalNot.cpp +++ b/src/gpu/cl/operators/ClLogicalNot.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClLogicalNot.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 49d14127ca..c14b1f2992 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -47,11 +47,17 @@ ClMatMul::ClMatMul() { } -Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status ClMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const GPUTarget gpu_target = CLScheduler::get().target(); @@ -61,11 +67,16 @@ Status 
ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) : - ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) + : ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); } -void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +void ClMatMul::configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); @@ -81,12 +92,13 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel->set_target(gpu_target); // Configure the low-precision native matrix multiply kernel - _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, + act_info); } else { @@ -99,7 +111,7 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l void ClMatMul::run(ITensorPack &tensors) { - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true); } diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index abbb75239a..64dcf217bd 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -26,6 +26,7 @@ #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/function_info/MatMulInfo.h" + #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" @@ -73,7 +74,11 @@ public: * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] act_info Class containing information about fused activation function. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -81,15 +86,19 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; private: - std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{ nullptr }; - std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{ nullptr }; + std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr}; + std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr}; - bool _is_quantized{ false }; + bool _is_quantized{false}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp index 2066f0cfaa..10cf8a6a38 100644 --- a/src/gpu/cl/operators/ClMul.cpp +++ b/src/gpu/cl/operators/ClMul.cpp @@ -24,17 +24,23 @@ #include "src/gpu/cl/operators/ClMul.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClMulKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClMulKernel.h" namespace arm_compute { namespace opencl { -void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void ClMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); auto k = std::make_unique<kernels::ClMulKernel>(); @@ -42,22 +48,34 @@ void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status ClMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); } -void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClComplexMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + 
ITensorInfo *dst, + const ActivationLayerInfo &act_info) { auto k = std::make_unique<kernels::ClComplexMulKernel>(); k->configure(compile_context, src1, src2, dst, act_info); _kernel = std::move(k); } -Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); } } // namespace opencl -} // namespace arm_compute
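[Review note: not part of the change] ClMul and ClComplexMul are thin single-kernel operators: configure() instantiates and configures the kernel, validate() forwards straight to the kernel's static check, and the inherited IClOperator::run() enqueues it. A hedged caller-side sketch under assumed context (tensor names are illustrative, arm_compute/arm_compute::opencl namespaces and the usual CL runtime headers in scope):

    opencl::ClMul mul;
    const Status st = opencl::ClMul::validate(&src1_info, &src2_info, &dst_info, /* scale */ 1.f,
                                              ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP);
    if (st.error_code() == ErrorCode::OK)
    {
        mul.configure(CLKernelLibrary::get().get_compile_context(), &src1_info, &src2_info, &dst_info,
                      1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP);
        // Tensors are bound at run time through an ITensorPack, as elsewhere in this diff
        ITensorPack pack{{TensorType::ACL_SRC_0, &src1}, {TensorType::ACL_SRC_1, &src2},
                         {TensorType::ACL_DST, &dst}};
        mul.run(pack); // IClOperator::run() enqueues the configured ClMulKernel
    }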
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h index 6086bc9d52..1cf4d68d4c 100644 --- a/src/gpu/cl/operators/ClMul.h +++ b/src/gpu/cl/operators/ClMul.h @@ -66,16 +66,27 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ @@ -92,14 +103,21 @@ public: * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp index cf4ebe6083..f3efd00bba 100644 --- a/src/gpu/cl/operators/ClPRelu.cpp +++ b/src/gpu/cl/operators/ClPRelu.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPRelu.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { using KernelType = kernels::ClArithmeticKernel; -void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output) +void ClPRelu::configure(const CLCompileContext &compile_context, + ITensorInfo *input, + ITensorInfo *alpha, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input, alpha, output); auto k = std::make_unique<KernelType>(); @@ -49,7 +51,7 @@ void ClPRelu::run(ITensorPack &tensors) { // Output tensor can be given as nullptr for in-place computation. // In this case, get the input tensor and use it as the output tensor. - if(tensors.get_tensor(TensorType::ACL_DST) == nullptr) + if (tensors.get_tensor(TensorType::ACL_DST) == nullptr) { auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation"); @@ -58,4 +60,4 @@ void ClPRelu::run(ITensorPack &tensors) IClOperator::run(tensors); } } // namespace opencl -} // namespace arm_compute
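The run() override above enables in-place execution: when the pack carries no ACL_DST tensor, ACL_SRC_0 is re-registered as the destination. A hypothetical invocation, assuming prelu, input and alpha were created and configured elsewhere:

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &input); // activations (assumed CLTensor)
    pack.add_const_tensor(TensorType::ACL_SRC_1, &alpha); // PRelu slope parameters (assumed CLTensor)
    // Deliberately no ACL_DST: run() detects this and writes the result back into ACL_SRC_0.
    prelu.run(pack);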
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h index 8084ab86cd..45ce858fb0 100644 --- a/src/gpu/cl/operators/ClPRelu.h +++ b/src/gpu/cl/operators/ClPRelu.h @@ -47,7 +47,8 @@ public: * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input. * @param[out] output Destination tensor. Data type supported: same as @p input */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + void + configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPRelu::configure() diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp index ed56f97bfe..3851e22b6a 100644 --- a/src/gpu/cl/operators/ClPermute.cpp +++ b/src/gpu/cl/operators/ClPermute.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPermute.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPermuteKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +void ClPermute::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) { ARM_COMPUTE_LOG_PARAMS(src, dst, perm); auto k = std::make_unique<kernels::ClPermuteKernel>(); @@ -45,4 +47,4 @@ Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const return kernels::ClPermuteKernel::validate(src, dst, perm); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h index 3e87329f9b..6349358a18 100644 --- a/src/gpu/cl/operators/ClPermute.h +++ b/src/gpu/cl/operators/ClPermute.h @@ -44,7 +44,10 @@ public: * @param[in] dst The dst tensor info. Data types supported: Same as @p src * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPermute::configure() @@ -55,4 +58,4 @@ public: }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PERMUTE_H */
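ClPermute's only descriptor is the PermutationVector; the destination shape can be derived with the same shape-calculator helper the softmax path uses further down. A minimal sketch, assuming the arm_compute namespace and an arbitrary 3-D permutation chosen purely for illustration:

    TensorInfo src(TensorShape(8U, 4U, 2U), 1, DataType::F32);
    const PermutationVector perm(2U, 0U, 1U); // illustrative reordering of the three dimensions

    const TensorShape permuted = misc::shape_calculator::compute_permutation_output_shape(src, perm);
    TensorInfo dst(src.clone()->set_tensor_shape(permuted));

    if (opencl::ClPermute::validate(&src, &dst, perm).error_code() == ErrorCode::OK)
    {
        opencl::ClPermute permute;
        permute.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, perm);
    }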
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_PERMUTE_H */ diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp index 3da90b8ced..e4507dc1a1 100644 --- a/src/gpu/cl/operators/ClPool2d.cpp +++ b/src/gpu/cl/operators/ClPool2d.cpp @@ -25,16 +25,19 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool2dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) +void ClPool2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices); @@ -49,7 +52,10 @@ void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *s CLScheduler::get().tune_kernel_static(*_kernel); } -Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices) +Status ClPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices) { return kernels::ClPool2dKernel::validate(src, dst, info, indices); } diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h index f353ba262e..9c2fd1c3f2 100644 --- a/src/gpu/cl/operators/ClPool2d.h +++ b/src/gpu/cl/operators/ClPool2d.h @@ -50,14 +50,21 @@ public: * @param[in] info Pooling layer parameters. * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp index 7dec6c5958..d230413659 100644 --- a/src/gpu/cl/operators/ClPool3d.cpp +++ b/src/gpu/cl/operators/ClPool3d.cpp @@ -25,16 +25,18 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool3dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool3d::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info) +void ClPool3d::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h index 7d994fd194..9fd78bfd69 100644 --- a/src/gpu/cl/operators/ClPool3d.h +++ b/src/gpu/cl/operators/ClPool3d.h @@ -51,7 +51,10 @@ public: * @param[out] dst Destination tensor info. * @param[in] info 3d Pooling layer parameters. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3d::configure() diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp index 47ae5cea47..8560b5553e 100644 --- a/src/gpu/cl/operators/ClQuantize.cpp +++ b/src/gpu/cl/operators/ClQuantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClQuantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClQuantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp index 560966f4fc..1dd5b760cb 100644 --- a/src/gpu/cl/operators/ClReshape.cpp +++ b/src/gpu/cl/operators/ClReshape.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClReshape.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClReshapeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp index 0798b19ca0..184e2aa006 100644 --- a/src/gpu/cl/operators/ClScale.cpp +++ b/src/gpu/cl/operators/ClScale.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScale::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -61,4 +64,4 @@ void ClScale::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_kernel.get(), tensors); } } // namespace opencl -} // namespace arm_compute
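ClScale is driven entirely by the ScaleKernelInfo descriptor. A minimal sketch, assuming bilinear 2x upscaling with a replicated border; the two-argument construction leaving ScaleKernelInfo's remaining sampling options at their defaults is an assumption of this sketch:

    TensorInfo src(TensorShape(32U, 32U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U, 64U), 1, DataType::F32); // 2x upscale
    const ScaleKernelInfo info{InterpolationPolicy::BILINEAR, BorderMode::REPLICATE};

    if (opencl::ClScale::validate(&src, &dst, info).error_code() == ErrorCode::OK)
    {
        opencl::ClScale scale;
        scale.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, info);
    }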
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h index af97cf23e7..1427bb4fdc 100644 --- a/src/gpu/cl/operators/ClScale.h +++ b/src/gpu/cl/operators/ClScale.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClScale::configure() diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp index 03809553a3..2bec400597 100644 --- a/src/gpu/cl/operators/ClSoftmax.cpp +++ b/src/gpu/cl/operators/ClSoftmax.cpp @@ -22,7 +22,10 @@ * SOFTWARE. */ #include "src/gpu/cl/operators/ClSoftmax.h" + #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" @@ -30,8 +33,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "support/Cast.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::experimental; namespace arm_compute @@ -52,7 +53,10 @@ ClSoftmax::ClSoftmax() { } -void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info) +void ClSoftmax::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -64,14 +68,15 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src; ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst; - if(_needs_permute) + if (_needs_permute) { const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info); } - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type(); - _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); + DataType tmp_data_type = + is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? 
DataType::S32 : tmp_input_info.data_type(); + _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); TensorShape max_sum_shape = tmp_input_info.tensor_shape(); _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape); @@ -83,33 +88,41 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info); _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info); - if(_needs_permute) + if (_needs_permute) { const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info); } - _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); - - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size()); + _aux_mem[InternalTensorIdx::SUM] = + MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); + _aux_mem[InternalTensorIdx::MAX] = + MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); + + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), + MemoryLifetime::Temporary, _permuted_src_info.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), + MemoryLifetime::Temporary, _permuted_dst_info.total_size()); } Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported"); ARM_COMPUTE_UNUSED(info.beta); - ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis); + ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || + static_cast<int32_t>(src.num_dimensions()) <= info.axis); - const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); + const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); const bool needs_permute = actual_axis != 0; - if(needs_permute) + if (needs_permute) { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); - TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); + const PermutationVector permutation_vector = + 
softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = + misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); + TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector)); TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector)); @@ -122,9 +135,14 @@ Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const TensorShape max_sum_shape = src.tensor_shape(); max_sum_shape.set(0, 1); TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); - TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); + TensorInfo tensor_info_sum(src.clone() + ->set_tensor_shape(max_sum_shape) + .set_data_type(tmp_data_type) + .set_quantization_info(QuantizationInfo()) + .set_is_resizable(true)); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info)); return Status{}; @@ -139,10 +157,12 @@ void ClSoftmax::run(ITensorPack &tensors) CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false); CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false); - CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false); - CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false); + CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, + false); + CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, + false); - if(_needs_permute) + if (_needs_permute) { ITensorPack pack; pack.add_const_tensor(TensorType::ACL_SRC, src); @@ -152,7 +172,7 @@ void ClSoftmax::run(ITensorPack &tensors) ITensorPack sum_pack; ITensorPack norm_pack; - if(_needs_permute) + if (_needs_permute) { sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get()); norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get()); @@ -172,7 +192,7 @@ void ClSoftmax::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false); CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false); - if(_needs_permute) + if (_needs_permute) { ITensorPack pack; pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get()); @@ -186,4 +206,4 @@ experimental::MemoryRequirements ClSoftmax::workspace() const return _aux_mem; } } // namespace opencl -} // namespace arm_compute
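To make the axis handling above concrete: validate() accepts any axis in [-num_dimensions, num_dimensions), wrap_around() folds negative values into that range, and any resolved axis other than 0 takes the permute path so the 1-D max/shift/exp-sum and norm kernels can run along dimension 0:

    // Worked example for a 4-D input (num_dimensions() == 4):
    //   info.axis = -1  ->  actual_axis = wrap_around(-1, 4) = 3  ->  needs_permute = true
    //   info.axis =  1  ->  actual_axis = 1                       ->  needs_permute = true
    //   info.axis =  0  ->  actual_axis = 0                       ->  needs_permute = false
    // When needs_permute is true the source is permuted so the softmax axis becomes
    // dimension 0, the kernels execute, and the result is permuted back into place.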
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h index 6c9af585d6..6c2aaaea80 100644 --- a/src/gpu/cl/operators/ClSoftmax.h +++ b/src/gpu/cl/operators/ClSoftmax.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SOFTMAX_H #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -52,7 +53,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClSoftmax::configure() @@ -61,7 +65,7 @@ public: */ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -79,7 +83,7 @@ private: std::unique_ptr<ClPermute> _permute_output; std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel; std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel; - bool _needs_permute{ false }; + bool _needs_permute{false}; TensorInfo _max_info; TensorInfo _sum_info; @@ -90,6 +94,6 @@ private: experimental::MemoryRequirements _aux_mem{}; }; -} // opencl -} // arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
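The _aux_mem entries populated in configure() surface to callers through the workspace() override declared above. A hypothetical caller sketch; allocate_aux_tensor is a made-up helper, and src, dst and info are assumed to exist:

    opencl::ClSoftmax softmax;
    softmax.configure(CLKernelLibrary::get().get_compile_context(), src, dst, info);

    for (const auto &req : softmax.workspace())
    {
        if (req.size > 0)
        {
            // req.slot matches the offset_int_vec(...) ids that run() looks up in the pack;
            // req.lifetime separates per-run temporaries from prepare-once data.
            allocate_aux_tensor(req.slot, req.size); // hypothetical helper
        }
    }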
\ No newline at end of file +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SOFTMAX_H */ diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp index 53be04a70f..5c6d0c3184 100644 --- a/src/gpu/cl/operators/ClSub.cpp +++ b/src/gpu/cl/operators/ClSub.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClSub.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClSub::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); @@ -41,8 +44,11 @@ void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClSub::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h index 7eac437143..6a97275b86 100644 --- a/src/gpu/cl/operators/ClSub.h +++ b/src/gpu/cl/operators/ClSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -73,7 +78,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp index 26feffe2b9..28da0d640a 100644 --- a/src/gpu/cl/operators/ClTranspose.cpp +++ b/src/gpu/cl/operators/ClTranspose.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClTranspose.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClTransposeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClTransposeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp index 90dbe7f291..cec438faeb 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.cpp +++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h" @@ -32,8 +33,12 @@ namespace arm_compute { namespace opencl { -void ClTransposedConvolution::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolution::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info); @@ -43,10 +48,14 @@ void ClTransposedConvolution::configure(const CLCompileContext &compile_context, _transposed_conv_kernel = std::move(kernel_object); } -Status ClTransposedConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info) +Status ClTransposedConvolution::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); return Status{}; } diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h index 58ebc689ed..660c4f85c1 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.h +++ b/src/gpu/cl/operators/ClTransposedConvolution.h @@ -68,23 +68,30 @@ public: * @param[in] deconv_info Contains padding and stride information described in @ref PadStrideInfo. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _transposed_conv_kernel{ nullptr }; + std::unique_ptr<IClKernel> _transposed_conv_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
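For the transposed-convolution descriptor, the spatial output follows the usual relation out = stride * (in - 1) + kernel - 2 * pad. A minimal sketch with assumed NHWC shapes (dimension 0 of a TensorShape is the channel in this layout), a 3x3 kernel, stride 2 and padding 1, so 8 -> 2*(8-1)+3-2 = 15:

    TensorInfo input(TensorShape(16U, 8U, 8U, 1U), 1, DataType::F32);   // C, W, H, N
    input.set_data_layout(DataLayout::NHWC);
    TensorInfo weights(TensorShape(16U, 3U, 3U, 4U), 1, DataType::F32); // C, W, H, num_kernels
    weights.set_data_layout(DataLayout::NHWC);
    TensorInfo output(TensorShape(4U, 15U, 15U, 1U), 1, DataType::F32); // out = 2*(8-1)+3-2 = 15
    output.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo deconv_info(2, 2, 1, 1); // stride x/y, pad x/y

    const Status s = opencl::ClTransposedConvolution::validate(&input, &weights,
                                                               nullptr /* no bias */, &output, deconv_info);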
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */ diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp index b4163a5986..8ec96b247e 100644 --- a/src/gpu/cl/operators/ClWinogradConv2d.cpp +++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp @@ -24,20 +24,19 @@ #include "src/gpu/cl/operators/ClWinogradConv2d.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" #include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h" #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" using namespace arm_compute::experimental; @@ -55,15 +54,16 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); // Check if the input spatial dimensions are smaller than 4 - const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); + const bool is_input_lt4_nchw = + (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); - if(kernel_max_dim == 3U) + if (kernel_max_dim == 3U) { - if(kernel_dims == Size2D(3U, 3U)) + if (kernel_dims == Size2D(3U, 3U)) { output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U); } - else if(kernel_dims == Size2D(3U, 1U)) + else if (kernel_dims == Size2D(3U, 1U)) { output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U); } @@ -72,15 +72,13 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U); } } - else if(kernel_max_dim == 5U) + else if (kernel_max_dim == 5U) { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, - kernel_dims.height == 1 ? 1U : 4U); + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U); } - else if(kernel_max_dim == 7U) + else if (kernel_max_dim == 7U) { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, - kernel_dims.height == 1 ? 1U : 2U); + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 
1U : 2U); } return output_tile; @@ -91,11 +89,9 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz // Check if we want to configure a Winograd configuration which requires fast math using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - std::vector<WinogradConfiguration> fast_math_winograd = - { + std::vector<WinogradConfiguration> fast_math_winograd = { WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)), - WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)) - }; + WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))}; auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), std::pair<int, int>(kernel_size.width, kernel_size.height)); @@ -103,8 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); } -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { // Get indeces for the width and height const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); @@ -115,41 +116,49 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), + "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), + "Winograd only supports padding up to half kernel size"); // Check if the Winograd configuration requires fast math - if(!enable_fast_math) + if (!enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), + "This Winograd configuration requires enable_fast_math=true"); } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); + const WinogradInfo winograd_info = + WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout()); // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); - const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); + const TensorShape input0_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info)); // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); + const TensorShape input1_shape = + misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); + const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info)); // Validate batched matrix multiply TensorShape batched_mm_output_shape = input0.tensor_shape(); batched_mm_output_shape[0] = input1.tensor_shape()[0]; const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, + GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); // Configure output transform - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); return Status{}; } @@ -171,8 +180,14 @@ ClWinogradConv2d::ClWinogradConv2d() ClWinogradConv2d::~ClWinogradConv2d() = default; -void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +void ClWinogradConv2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math); @@ -187,50 +202,53 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso const Size2D output_tile = 
winograd_output_tile(input_dims, kernel_size, src->data_layout()); // Check if the Winograd configuration requires fast math - if(!enable_fast_math) + if (!enable_fast_math) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, + DataType::F32); //disable winograd for fp16 if fast math is false. + ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), + "This Winograd configuration requires enable_fast_math=true"); } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); + const WinogradInfo winograd_info = + WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout()); _is_prepared = false; // Configure input transform _input_transform->configure(compile_context, src, &_input0, winograd_info); - _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, + PixelValue()); // Configure filter transform _filter_transform->configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply - _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, - false, false, - GEMMLowpOutputStageInfo(), - (src->data_type() == DataType::F16))); + _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, + GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))); // Configure output transform _output_transform->set_target(CLScheduler::get().target()); _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info); - _aux_mem = _batched_mm.workspace(); - const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r) - { - return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); - }) ? - MemoryLifetime::Prepare : - MemoryLifetime::Persistent; + _aux_mem = _batched_mm.workspace(); + const MemoryLifetime wino_wei_lifetm = + std::any_of(std::begin(_aux_mem), std::end(_aux_mem), + [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); }) + ? 
MemoryLifetime::Prepare + : MemoryLifetime::Persistent; _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size())); } -Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status ClWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); return Status{}; @@ -251,10 +269,9 @@ void ClWinogradConv2d::run(ITensorPack &tensors) prepare(tensors); // Run input transform - ITensorPack pack_it - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, input0.get() }, + ITensorPack pack_it{ + {TensorType::ACL_SRC, src}, + {TensorType::ACL_DST, input0.get()}, }; CLScheduler::get().enqueue_op(_border_handler, pack_it, false); CLScheduler::get().enqueue_op(*_input_transform, pack_it, false); @@ -263,31 +280,31 @@ void ClWinogradConv2d::run(ITensorPack &tensors) ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get()); pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get()); - is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); + is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) + : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); _batched_mm.run(pack_mm); // Run output transform - ITensorPack pack_ot - { - { TensorType::ACL_SRC_0, batched_mm_output.get() }, - { TensorType::ACL_SRC_1, biases }, - { TensorType::ACL_DST, dst }, + ITensorPack pack_ot{ + {TensorType::ACL_SRC_0, batched_mm_output.get()}, + {TensorType::ACL_SRC_1, biases}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_output_transform, pack_ot); } void ClWinogradConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3))); CLAuxTensorHandler input1(_input1, *in1_aux); - ITensorPack pack_ft - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, input1.get() }, + ITensorPack pack_ft{ + {TensorType::ACL_SRC, weights}, + {TensorType::ACL_DST, input1.get()}, }; // Run filter transform and mark original weights as unused CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false); @@ -308,4 +325,4 @@ experimental::MemoryRequirements ClWinogradConv2d::workspace() const return _aux_mem; } } // namespace opencl -} // namespace arm_compute
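Tying the pieces above together: winograd_output_tile() maps a 5x5 kernel to a 4x4 output tile, and (4,4)/(5,5) is one of the two configurations check_support_fast_math() flags, so validate() rejects it unless the caller opts into fast math. A sketch under assumed NHWC shapes:

    TensorInfo src(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32); // C, W, H, N
    src.set_data_layout(DataLayout::NHWC);
    TensorInfo weights(TensorShape(64U, 5U, 5U, 32U), 1, DataType::F32);
    weights.set_data_layout(DataLayout::NHWC);
    TensorInfo dst(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32);
    dst.set_data_layout(DataLayout::NHWC);
    const PadStrideInfo conv_info(1, 1, 2, 2); // stride 1, 'same' padding for a 5x5 kernel

    // Rejected: the 5x5 kernel selects a 4x4 tile, which requires fast math.
    Status s = opencl::ClWinogradConv2d::validate(&src, &weights, nullptr, &dst, conv_info,
                                                  ActivationLayerInfo(), /* enable_fast_math */ false);
    // Accepted once the caller opts in, possibly trading some accuracy.
    s = opencl::ClWinogradConv2d::validate(&src, &weights, nullptr, &dst, conv_info,
                                           ActivationLayerInfo(), /* enable_fast_math */ true);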
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h index eb2f7a72b2..54ec1a1737 100644 --- a/src/gpu/cl/operators/ClWinogradConv2d.h +++ b/src/gpu/cl/operators/ClWinogradConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRADCONV2D_H #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -41,7 +42,7 @@ namespace kernels class ClWinogradInputTransformKernel; class ClWinogradFilterTransformKernel; class ClWinogradOutputTransformKernel; -} // kernels +} // namespace kernels /** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: * * -# @ref kernels::ClWinogradInputTransformKernel @@ -93,20 +94,31 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited method overridden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h index af383489a1..81dc3baef4 100644 --- a/src/gpu/cl/utils/ClAuxTensorHandler.h +++ b/src/gpu/cl/utils/ClAuxTensorHandler.h @@ -39,25 +39,26 @@ namespace opencl class CLAuxTensorHandler { public: - CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) + CLAuxTensorHandler( + int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) : _tensor() { - if(info.total_size() == 0) + if (info.total_size() == 0) { return; } _tensor.allocator()->soft_init(info); ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) + if ((packed_tensor == nullptr) || (info.total_size() > 
packed_tensor->info()->total_size())) { - if(!bypass_alloc) + if (!bypass_alloc) { _tensor.allocator()->allocate(); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor"); } - if(pack_inject) + if (pack_inject) { pack.add_tensor(slot_id, &_tensor); _injected_tensor_pack = &pack; @@ -70,22 +71,21 @@ public: } } - CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) - : _tensor() + CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) : _tensor() { _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) + if (info.total_size() <= tensor.info()->total_size()) { _tensor.allocator()->import_memory(tensor.cl_buffer()); } } - CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; + CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete; ~CLAuxTensorHandler() { - if(_injected_tensor_pack) + if (_injected_tensor_pack) { _injected_tensor_pack->remove_tensor(_injected_slot_id); } @@ -103,9 +103,9 @@ public: private: CLTensor _tensor{}; - ITensorPack *_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ TensorType::ACL_UNKNOWN }; + ITensorPack *_injected_tensor_pack{nullptr}; + int _injected_slot_id{TensorType::ACL_UNKNOWN}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
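Finally, the constructor above gives CLAuxTensorHandler two paths: import the buffer already present in the pack at slot_id when it is large enough, or fall back to allocating a private CLTensor (skipped when bypass_alloc is set). A minimal sketch, assuming a workspace slot id of offset_int_vec(0):

    TensorInfo tmp_info(TensorShape(128U), 1, DataType::F32);
    ITensorPack tensors; // may or may not already carry a tensor for this slot

    // Imports the packed tensor's cl_buffer() when one is present and large enough;
    // otherwise a private backing tensor is allocated.
    CLAuxTensorHandler tmp(offset_int_vec(0), tmp_info, tensors);
    ICLTensor *workspace = tmp.get(); // valid for the handler's lifetime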